In [9]:
import itertools
import json
import time

import gym
import numpy as np
import tqdm

env = gym.make('Pendulum-v1', g=9.81)

In [10]:
coords = np.array([(np.cos(np.deg2rad(theta)), np.sin(np.deg2rad(theta))) for theta in range(0, 360, 10)])
x = coords[:, 0]
y = coords[:, 1]

# create list of velocity from -8 to 8 in steps of 0.5
velocities = np.arange(-8, 8, 1)

# combine all possible combinations of coords and velocity
possible_states = []
for i, j in itertools.product(coords, velocities):
    x = i[0]
    y = i[1]
    possible_states.append((x, y, j))
possible_states = np.array(possible_states) # 576 possible states
possible_actions = np.arange(-2.0, 2.0, 0.1)

In [18]:
def descretize_state(state, possible_states):
    # state = (0.9, 0.4, 3.3)
    # closest coord according to l2 distance
    index = np.argmin(np.linalg.norm(possible_states - state, axis=1))
    return index, possible_states[index]

def value_iteration(env, possible_actions, possible_states, gamma, epsilon, max_iteration):
    v = np.zeros(len(possible_states))
    deltas = []
    for i in range(max_iteration):
        prev_v = np.copy(v)
        for i, state in enumerate(possible_states):
            q_sa = []
            for j, action in enumerate(possible_actions):
                env.reset()
                env.state = state
                next_state, reward, done, _ = env.step([action])
                idx, next_state = descretize_state(next_state, possible_states)
                q_sa.append(reward + gamma * prev_v[idx])
            v[i] = max(q_sa)
        delta = np.sum(np.fabs(prev_v - v))
        deltas.append(delta)
        if delta <= epsilon:
            print ('Value-iteration converged at iteration# %d.' %(i+1))
            break
    return v, deltas

def policy_iteration(V, env, possible_actions, possible_states, gamma):
    new_policy = np.zeros(len(possible_states))
    for i, state in enumerate(possible_states):
        new_v = np.zeros(len(possible_actions))
        for j, action in enumerate(possible_actions):
            env.reset()
            env.state = state
            next_state, reward, done, info = env.step([action])
            idx, next_state = descretize_state(next_state, possible_states)
            new_v[j] += reward + gamma * V[idx]
        new_policy[i] = int(np.argmax(new_v))
    return new_policy.astype(np.int32)

def test_policy(env, policy, gamma,  n = 100):
    def run_episode(env, policy, gamma):
        obs = env.reset()
        state_index, state = descretize_state(obs, possible_states)
        total_reward = 0
        step_idx = 0
        while True:
            action = policy[state_index]
            obs, reward, done , _ = env.step([action])
            state_index, state = descretize_state(obs, possible_states)
            total_reward += (gamma ** step_idx * reward)
            step_idx += 1
            if done:
                break
        return total_reward
    scores = [run_episode(env, policy, gamma) for _ in range(n)]
    return np.mean(scores)

def run():
    max_iterations = [1e1, 1e2, 1e4, 1e6]
    gammas = [0.1, 0.6, 0.9]
    epsilons = [1e-1, 1e-3, 1e-5]
    hyperparameters = list(itertools.product(max_iterations, gammas, epsilons))
    for params in tqdm.tqdm(hyperparameters, 'Training'):
        max_iter, gamma, epsilon = params

        start_time = time.process_time()
        V, deltas = value_iteration(env, possible_actions, possible_states, gamma, epsilon, int(max_iter))
        value_iteration_time = time.process_time() - start_time

        start_time = time.process_time()
        policy = policy_iteration(V, env, possible_actions, possible_states, gamma)
        policy_time = time.process_time() - start_time

        env.reset()
        start_time = time.process_time()
        average_policy_score = test_policy(env, policy, gamma, n=1000)
        policy_score_time = start_time - time.process_time()

        result = {
            'average_policy_score': average_policy_score,
            'policy': policy.tolist(),
            'V': V.tolist(),
            'deltas': deltas,
            'max_iterations': max_iter,
            'gamma': gamma,
            'epsilon': epsilon,
            'value_iteration_wall_time': value_iteration_time,
            'policy_time': policy_time,
            'policy_score_time': policy_score_time
        }

        with open(f'artifacts/pendulum_value_iteration/{time.time_ns()}.json', 'w') as f:
            json.dump(result, f)

run()

Training:   7%|▋         | 2/27 [01:01<12:43, 30.55s/it]


KeyboardInterrupt: 