# In [105]:
import itertools
import json
import math
import os
import time
from typing import Dict

import gym
import numpy as np
import tqdm

# In [82]:
# Points on the unit circle, (cos(theta), sin(theta)), for theta = 0, 10, ..., 350 degrees.
coords = np.array([(np.cos(np.deg2rad(theta)), np.sin(np.deg2rad(theta))) for theta in range(0, 360, 10)])
x = coords[:, 0]
y = coords[:, 1]

# Velocities from -8 to 7 in steps of 1 (np.arange excludes the stop value 8).
velocities = np.arange(-8, 8, 1)

# Every combination of a circle point and a velocity, one (x, y, velocity) row each.
# The comprehension keeps its loop variables local, so the x / y columns above
# are no longer overwritten with the last point's scalars.
possible_states = np.array([
    (cos_theta, sin_theta, velocity)
    for (cos_theta, sin_theta), velocity in itertools.product(coords, velocities)
])  # 36 angles * 16 velocities = 576 possible states

# Candidate actions from -2.0 up to (but not including) 2.0 in steps of 0.1.
possible_actions = np.arange(-2.0, 2.0, 0.1)

def descretize_state(state, possible_states):
    """Snap a continuous state onto the closest row of the discrete state grid.

    Closeness is the Euclidean distance between ``state`` and each row of
    ``possible_states``; ties resolve to the lowest row index.

    Returns:
        A ``(index, row)`` pair: the index of the closest row and the row itself.
    """
    offsets = possible_states - state
    distances = np.linalg.norm(offsets, axis=1)
    nearest = distances.argmin()
    return nearest, possible_states[nearest]


def policy_evaluation(policy, env, possible_states, max_policy_iteration, gamma, epsilon):
    """Iteratively evaluate a fixed policy on the discretised state grid.

    Each sweep applies the one-step backup
    ``V_new[s] = reward + gamma * V[s']`` to every grid state, where ``s'`` is
    the grid state nearest to the environment's successor observation.

    Args:
        policy: Mapping (dict or array) from state index to the action taken there.
        env: Gym Pendulum environment, possibly wrapped (as returned by ``gym.make``).
        possible_states: Array of grid states, one ``(x, y, velocity)`` row each.
        max_policy_iteration: Maximum number of sweeps over the grid.
        gamma: Discount factor.
        epsilon: Convergence threshold on the largest per-state value change.

    Returns:
        ``(differences, V)``: the largest absolute value change of every sweep,
        and the value estimate after the last sweep that was run.
    """
    # gym.make() hands back the environment inside wrappers; assigning ``state``
    # on a wrapper only creates an attribute on the wrapper and never reaches
    # the simulator, so write to the innermost environment instead.
    base_env = getattr(env, 'unwrapped', env)

    V = np.zeros(len(possible_states))
    differences = []

    # Repeated sweeps propagate value information one transition further each time.
    for _ in range(max_policy_iteration):
        new_V = np.zeros(len(possible_states))
        for state_index, state in enumerate(possible_states):
            env.reset()
            # Pendulum-v1 keeps its internal state as [theta, theta_dot] (see
            # the Gym Pendulum docs), so convert the (cos, sin, velocity) grid
            # row back to an angle before stepping.
            base_env.state = np.array([np.arctan2(state[1], state[0]), state[2]])

            action = policy[state_index]
            next_state, reward, done, _ = env.step([action])
            next_state_index, _ = descretize_state(next_state, possible_states)

            # One-step backup. The state's previous value must not be added in,
            # otherwise the estimates accumulate every sweep and never converge.
            new_V[state_index] = reward + gamma * V[next_state_index]

        # Check convergence
        max_difference = np.max(np.abs(new_V - V))
        differences.append(max_difference)

        # Keep the freshest estimate even when this sweep is the converging one.
        V = new_V
        if max_difference < epsilon:
            print('value converged')
            break

    return differences, V

def dict_argmax(dictionary: Dict):
    """Return the first key, in iteration order, whose value equals the maximum value.

    Returns ``None`` when no value compares equal to the maximum.
    """
    # TODO handle multiple keys with the same max value
    best = max(dictionary.values())
    matching_keys = (key for key, value in dictionary.items() if value == best)
    return next(matching_keys, None)

def policy_improvement(V, env, possible_actions, possible_states, gamma):
    """Build the policy that is greedy with respect to the value estimate ``V``.

    For every grid state each candidate action is simulated for one step and
    scored as ``reward + gamma * V[s']``, where ``s'`` is the grid state nearest
    to the successor observation. The best-scoring action is selected; ties go
    to the action that comes first in ``possible_actions``.

    Args:
        V: Value estimate indexed by state index.
        env: Gym Pendulum environment, possibly wrapped (as returned by ``gym.make``).
        possible_actions: Candidate actions.
        possible_states: Array of grid states, one ``(x, y, velocity)`` row each.
        gamma: Discount factor.

    Returns:
        Dict mapping each state index to the selected action (a Python float).
    """
    # Assigning ``state`` on a gym wrapper never reaches the simulator, so
    # write to the innermost environment instead.
    base_env = getattr(env, 'unwrapped', env)

    new_policy = dict()
    for state_index, state in enumerate(possible_states):
        action_values = np.zeros(len(possible_actions))
        for action_index, action in enumerate(possible_actions):
            env.reset()
            # Pendulum-v1 keeps its internal state as [theta, theta_dot] (see
            # the Gym Pendulum docs), so convert the (cos, sin, velocity) grid
            # row back to an angle before stepping.
            base_env.state = np.array([np.arctan2(state[1], state[0]), state[2]])
            next_state, reward, done, info = env.step([action])
            next_state_index, _ = descretize_state(next_state, possible_states)

            action_values[action_index] = reward + (gamma * V[next_state_index])

        # Store the best ACTION, not its value: the policy is later fed to env.step().
        best_action_index = np.argmax(action_values)
        new_policy[state_index] = float(possible_actions[best_action_index])

    return new_policy

def l2(a, b):
    """Return the Euclidean distance between ``a`` and ``b``.

    Walks the positions of ``a`` and reads the matching entry of ``b`` by index.
    """
    squared_total = 0
    for position, left in enumerate(a):
        squared_total += (left - b[position]) ** 2
    return math.sqrt(squared_total)

def test_policy(env, policy, gamma, n=100):
    """Estimate a policy's quality as the mean discounted return over ``n`` episodes.

    Each episode resets the environment, snaps every observation onto the
    module-level ``possible_states`` grid, plays the action the policy stores
    for that grid index, and accumulates ``gamma ** t * reward`` until the
    environment reports ``done``.
    """
    def rollout():
        # One full episode; returns its discounted return.
        observation = env.reset()
        current_index, _ = descretize_state(observation, possible_states)
        discounted_return = 0
        for t in itertools.count():
            chosen_action = policy[current_index]
            observation, reward, done, _ = env.step([chosen_action])
            current_index, _ = descretize_state(observation, possible_states)
            discounted_return += (gamma ** t * reward)
            if done:
                return discounted_return

    episode_returns = []
    for _ in range(n):
        episode_returns.append(rollout())
    return np.mean(episode_returns)

# Hyperparameter grid for the experiment: every combination of the four lists
# below is run once, with the right-most list varying fastest.
max_policy_iterations = [1e2, 1e4, 1e6]  # sweep cap per policy evaluation
gammas = [0.1, 0.6, 0.9]  # discount factors
epsilons = [0.1, 0.001, 0.00001]  # convergence thresholds for policy evaluation
n_iterations = [1e2, 1e3, 1e4]  # cap on improvement/evaluation rounds
hyperparameters = [
    (sweep_cap, discount, tolerance, round_cap)
    for sweep_cap in max_policy_iterations
    for discount in gammas
    for tolerance in epsilons
    for round_cap in n_iterations
]

def run():
    """Run the policy-iteration experiment for every hyperparameter combination.

    For each ``(max_policy_iteration, gamma, epsilon, n_iteration)`` tuple in
    ``hyperparameters`` this starts from the all-zero-action policy, alternates
    policy improvement and policy evaluation until the value function no longer
    changes or ``n_iteration`` rounds have run, scores the final policy with
    ``test_policy``, and writes the results as one JSON file under
    ``artifacts/pendulum_policy_iteration/``.
    """
    # Create the output directory up front so a long run cannot fail at the
    # first write just because the folder is missing.
    os.makedirs('artifacts/pendulum_policy_iteration', exist_ok=True)

    env = gym.make('Pendulum-v1', g=9.81)
    try:
        for max_policy_iteration, gamma, epsilon, n_iteration in tqdm.tqdm(hyperparameters, desc="Policy Iteration Experiment"):
            iteration_differences = []
            policy_eval_differences = []
            env.reset()
            policy = {state_index: 0 for state_index in range(len(possible_states))}

            start_time = time.process_time()
            differences, V = policy_evaluation(policy, env, possible_states, max_policy_iteration=int(max_policy_iteration), gamma=gamma, epsilon=epsilon)
            policy_eval_differences.append(differences)
            for _ in range(int(n_iteration)):
                policy = policy_improvement(V, env, possible_actions, possible_states, gamma)
                differences, new_V = policy_evaluation(policy, env, possible_states, max_policy_iteration=int(max_policy_iteration), gamma=gamma, epsilon=epsilon)
                policy_eval_differences.append(differences)
                iteration_differences.append(l2(V, new_V))
                # Stop as soon as a round leaves the value function exactly unchanged.
                if iteration_differences[-1] == 0:
                    break
                V = new_V
            # NOTE: time.process_time() measures CPU time of this process, not
            # elapsed wall-clock time, although it is stored under 'wall_time'.
            wall_time = time.process_time() - start_time

            average_test_score = test_policy(env, policy, gamma)

            result = {
                'iteration_differences': iteration_differences,
                'policy_evaluation_differences': policy_eval_differences,
                'V': V.tolist(),
                'policy': policy,
                'n_iterations': n_iteration,
                'gamma': gamma,
                'max_policy_iteration': max_policy_iteration,
                'epsilon': epsilon,
                'wall_time': wall_time,
                'average_test_score': average_test_score
            }

            with open(f'artifacts/pendulum_policy_iteration/{time.time_ns()}.json', 'w') as f:
                json.dump(result, f)
    finally:
        # Release the environment's resources even if a run fails part-way.
        env.close()

# Launch the experiment only when executed directly (script or notebook cell),
# not as a side effect of importing this module.
if __name__ == "__main__":
    run()