In [37]:
from Frozen_Lake import FrozenLakeEnv
import numpy as np
import time
from tqdm.notebook import tqdm

In [2]:
env = FrozenLakeEnv()

In [3]:
all_states = env.get_all_states()

In [4]:
for state in all_states:
    print(env.get_possible_actions(state))

('left', 'down', 'right', 'up')
('left', 'down', 'right', 'up')
('left', 'down', 'right', 'up')
('left', 'down', 'right', 'up')
('left', 'down', 'right', 'up')
()
('left', 'down', 'right', 'up')
()
('left', 'down', 'right', 'up')
('left', 'down', 'right', 'up')
('left', 'down', 'right', 'up')
()
()
('left', 'down', 'right', 'up')
('left', 'down', 'right', 'up')
()


In [5]:
env.render()

*FFF
FHFH
FFFH
HFFG



In [6]:
def init_policy(env):
    policy = {}
    for state in env.get_all_states():
        policy[state] = {}
        for action in env.get_possible_actions(state):
            policy[state][action] = 1 / len(env.get_possible_actions(state))
    return policy

In [7]:
policy = init_policy(env)

In [8]:
def policy_evaluation_step(policy, values, gamma):
    q_values = get_q_values(values, gamma)
    new_values = {}
    for state in env.get_all_states():
        new_values[state] = 0
        for action in env.get_possible_actions(state):
            new_values[state] += policy[state][action] * q_values[state][action]
    return new_values

In [9]:
def init_values():
    return {state: 0 for state in env.get_all_states()}

In [10]:
values = init_values()

In [11]:
def get_q_values(values, gamma):
    q_values = {}
    for state in env.get_all_states():
        q_values[state] = {}
        for action in env.get_possible_actions(state):
            q_values[state][action] = 0
            for next_state in env.get_next_states(state, action):
                reward = env.get_reward(state, action, next_state)
                transition_prob = env.get_transition_prob(state, action, next_state)
                next_value = values[next_state]
                q_values[state][action] += reward + gamma * transition_prob  * next_value
                
    return q_values

In [12]:
values = policy_evaluation_step(policy, values, gamma=0.9)
values

{(0, 0): 0.0,
 (0, 1): 0.0,
 (0, 2): 0.0,
 (0, 3): 0.0,
 (1, 0): 0.0,
 (1, 1): 0,
 (1, 2): 0.0,
 (1, 3): 0,
 (2, 0): 0.0,
 (2, 1): 0.0,
 (2, 2): 0.0,
 (2, 3): 0,
 (3, 0): 0,
 (3, 1): 0.0,
 (3, 2): 0.75,
 (3, 3): 0}

In [13]:
def policy_evaluation(policy, values, gamma, evaluation_step_n):
    
    for _ in range(evaluation_step_n):
        values = policy_evaluation_step(policy, values, gamma)
    q_values = get_q_values(values, gamma)
    return q_values

In [23]:
def policy_evaluation_with_resetting(policy, gamma, evaluation_step_n):
    values = init_values()
    for _ in range(evaluation_step_n):
        values = policy_evaluation_step(policy, values, gamma)
    q_values = get_q_values(values, gamma)
    return q_values

In [14]:
q_values = policy_evaluation(policy, values, gamma=0.9, evaluation_step_n=100)
q_values

{(0, 0): {'left': 0.012694672242094042,
  'down': 0.01686835383364476,
  'right': 0.01214429542378969,
  'up': 0.012019806754971888},
 (0, 1): {'left': 0.010810946369245896,
  'down': 0.003926884642953772,
  'right': 0.02288425734126018,
  'up': 0.013047390910457366},
 (0, 2): {'left': 0.0189486317796474,
  'down': 0.059132792337086376,
  'right': 0.01872347762670558,
  'up': 0.023996176355580403},
 (0, 3): {'left': 0.022856113072142453,
  'down': 0.003829943271548001,
  'right': 0.010007271128881995,
  'up': 0.012725295386109774},
 (1, 0): {'left': 0.020770851485299637,
  'down': 0.042155416251302716,
  'right': 0.00625142132081882,
  'up': 0.01148581185636805},
 (1, 1): {},
 (1, 2): {'left': 0.031600450021849295,
  'down': 0.23105940611697212,
  'right': 0.031600450021849295,
  'up': 0.02174419405782223},
 (1, 3): {},
 (2, 0): {'left': 0.042155416251302716,
  'down': 0.0205964531631862,
  'right': 0.1262460665953071,
  'up': 0.03511588332766702},
 (2, 1): {'left': 0.07554391068366573

In [20]:
def policy_improvement(q_values):
    new_policy = {}
    for state in env.get_all_states():
        new_policy[state] = {}
        max_action = None
        max_q_value = float('-inf')
        for action in env.get_possible_actions(state):
            if q_values[state][action] > max_q_value:
                max_q_value = q_values[state][action]
                max_action = action
        for action in env.get_possible_actions(state):
            new_policy[state][action] = 1 if action == max_action else 0
    return new_policy

In [16]:
policy_improvement(q_values)

{(0, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (0, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 3): {'left': 1, 'down': 0, 'right': 0, 'up': 0},
 (1, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 1): {},
 (1, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 3): {},
 (2, 0): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (2, 1): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 3): {},
 (3, 0): {},
 (3, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (3, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (3, 3): {}}

In [39]:
def train(epochs, evaluation_step_n, gamma, env, reset_values=False):
    policy = init_policy(env)
    for epoch in range(epochs):
        if reset_values:
            q_values = policy_evaluation_with_resetting(policy, gamma, evaluation_step_n)
        else: 
            q_values = policy_evaluation(policy, values, gamma, evaluation_step_n)
        policy = policy_improvement(q_values)

    return policy



def test(policy, vizualize=False):
    total_reward = 0
    state = env.reset()
    for _ in range(100):
        action = np.random.choice(env.get_possible_actions(state), p=list(policy[state].values()))
        state, reward, done, _ = env.step(action)
        if vizualize:
            env.render()
        total_reward += reward

        if done:
            break

    return total_reward

In [40]:

epochs = 20
evaluation_step_n = 100
gamma = 0.9

policy_with_resetting = train(epochs, evaluation_step_n, gamma, env, reset_values=False)
policy_without_resetting = train(epochs, evaluation_step_n, gamma, env, reset_values=False)


rewards_with_resetting = [test(policy_with_resetting) for _ in tqdm(range(10000))]
rewards_without_resetting = [test(policy_without_resetting) for _ in tqdm(range(10000))]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [46]:
print(f'Mean reward for 10_000 tries with resetting [{np.mean(rewards_with_resetting)}]')
print(f'Mean reward for 10_000 tries with resetting [{np.mean(rewards_without_resetting)}]')

Mean reward for 10_000 tries with resetting [0.7373]
Mean reward for 10_000 tries with resetting [0.7452]
