In [None]:
import sys, os

sys.path.insert(0, os.path.abspath(".."))
import numpy as np
from itertools import product

from sailing import SailingGridworld
from common import helper as h


In [None]:
# --- Hyper-parameters of the exercise ---------------------------------------
# Discount factor used in the value backup and in the evaluation returns.
gamma = 0.9

# Convergence threshold for the value function.
# TODO: use this value for Task 2 and Task 3
eps = 1e-4

# Number of value-iteration sweeps (also forwarded to the environment).
# TODO: change this in Task 2
value_update_iter = 100

# Reward forwarded to the environment as `rock_penalty`.
# TODO: change this in Q1.3
rock_penalty = -2


In [None]:
# Build the sailing gridworld with the hyper-parameters chosen above.
env = SailingGridworld(
    rock_penalty=rock_penalty,
    value_update_iter=value_update_iter,
)

In [None]:
def get_values_policy(iterations, tol=None):
    """Run value iteration on the global ``env`` and extract the greedy policy.

    Every sweep performs a synchronous Bellman optimality backup: all states
    are updated from a frozen copy of the previous sweep's values, using the
    module-level discount factor ``gamma``.

    Args:
        iterations: Maximum number of sweeps over the ``env.w`` x ``env.h`` grid.
        tol: Optional convergence threshold. When given, iteration stops after
            the first sweep whose largest absolute value change is below
            ``tol``. With the default ``None`` all ``iterations`` sweeps run.

    Returns:
        Tuple ``(v_est, policy)`` of float arrays with shape
        ``(env.w, env.h)``: the state-value estimates and, for every state,
        the id of the action with the highest backed-up value.
    """
    actions = (env.UP, env.DOWN, env.RIGHT, env.LEFT)
    v_est = np.zeros((env.w, env.h))
    policy = np.zeros((env.w, env.h))
    env.draw_values_policy(v_est, policy)

    # Process-wide display setting for float arrays; it does not depend on the
    # sweep, so it is applied once instead of on every iteration.
    np.set_printoptions(formatter={"float": lambda x: "{0:0.1f}".format(x)})

    sweeps_done = 0
    max_change = float("inf")
    for sweep in range(iterations):
        # Back up from a snapshot so updates within a sweep do not interact.
        v_prev = v_est.copy()
        for x in range(env.w):
            for y in range(env.h):
                action_values = {}
                for action in actions:
                    # Each entry exposes .state, .reward, .done and .prob, e.g.
                    # Transition(state=(0, 1), reward=0.0, done=0.0, prob=0.9)
                    expected_value = 0.0
                    for transit in env.transitions[x, y, action]:
                        if transit.state is None:
                            # No successor state: only the immediate reward
                            # counts, there is nothing to bootstrap from.
                            next_value = 0.0
                        else:
                            xt, yt = transit.state
                            next_value = v_prev[xt, yt]
                        expected_value += transit.prob * (
                            transit.reward + gamma * next_value
                        )
                    action_values[action] = expected_value

                # Ties are resolved in favour of the first action in `actions`.
                best_action = max(action_values, key=action_values.get)
                policy[x, y] = best_action
                v_est[x, y] = action_values[best_action]

        sweeps_done = sweep + 1
        max_change = np.abs(v_est - v_prev).max()
        if tol is not None and max_change < tol:
            break

    print(
        "Value iteration finished after",
        sweeps_done,
        "sweeps; max value change in last sweep:",
        max_change,
    )
    return v_est, policy


# Run value iteration for the configured number of sweeps and unpack the
# resulting value estimates and greedy policy.
value_iteration_result = get_values_policy(iterations=value_update_iter)
value_est, policy = value_iteration_result


In [None]:
# Evaluate the policy: roll out N episodes and record one discounted return
# (measured from the initial state) per episode.
N = 1000  # TODO: change for task 4
discounted_rewards = []

for ep in range(N):
    state = env.reset()
    done = False
    step = 0
    episode_return = 0.0
    while not done:
        # The policy array stores action ids as floats; cast to a plain int
        # before handing it to the environment (presumably it is used as an
        # index there -- NumPy rejects float indices).
        action = int(policy[state])

        # Take a step in the environment
        state, reward, done, _ = env.step(action)

        # Accumulate the discounted return G = sum_t gamma^t * r_t
        episode_return += (gamma**step) * reward
        step += 1

        # Uncomment the line below to enable rendering (slows computations down)
        # env.render()

    # Exactly one entry per episode, including episodes whose return is 0,
    # so the mean/std below are taken over episodes rather than single rewards.
    discounted_rewards.append(episode_return)

In [None]:
# Summary statistics of the values collected during policy evaluation.
returns_array = np.asarray(discounted_rewards, dtype=float)
print("Mean:", returns_array.mean())
print("Standard deviation:", returns_array.std())

In [None]:
# Persist the value estimates and the policy together in one pickle file.
value_policy_bundle = {"value": value_est, "policy": policy}
h.save_object(value_policy_bundle, "./value_policy.pkl")
