In [4]:
import itertools
import json
import os
import time

import gym
import numpy as np
import tqdm


def test_policy(env, policy, gamma,  n = 100):
    def run_episode(env, policy, gamma):
        obs = env.reset()
        total_reward = 0
        step_idx = 0
        while True:
            obs, reward, done , _ = env.step(int(policy[obs]))
            total_reward += (gamma ** step_idx * reward)
            step_idx += 1
            if done:
                break
        return total_reward
    scores = [run_episode(env, policy, gamma) for _ in range(n)]
    return np.mean(scores)

def extract_policy(v, gamma, env=None):
    """Derive the greedy policy for the state values `v`.

    For every state the one-step lookahead value of each action is computed
    from the transition table ``env.P[s][a]`` and the action with the largest
    value is selected (``np.argmax``, so ties resolve to the lowest index).

    Args:
        v: State values, indexable by state id.
        gamma: Discount factor applied to the successor state's value.
        env: Environment exposing ``nS``, ``action_space.n`` and ``P``.
            When omitted, the module-level ``env`` is used, which keeps
            existing ``extract_policy(v, gamma)`` calls working.

    Returns:
        A float ``numpy`` array of length ``env.nS`` holding the chosen action
        index per state. The dtype is intentionally left as float so callers
        that serialise or ``int()``-cast the entries see the same values.

    Raises:
        NameError: If `env` is not passed and no module-level ``env`` exists.
    """
    if env is None:
        # Backward compatibility: the function historically read a global.
        try:
            env = globals()["env"]
        except KeyError:
            raise NameError(
                "name 'env' is not defined; pass env= explicitly"
            ) from None

    n_states = env.nS
    n_actions = env.action_space.n
    policy = np.zeros(n_states)
    for s in range(n_states):
        q_sa = np.zeros(n_actions)
        for a in range(n_actions):
            # Each entry is a 4-tuple; the first three are used as
            # (probability, next state, reward). The fourth is ignored
            # (presumably a terminal flag -- confirm against the env).
            for p, s_, r, _ in env.P[s][a]:
                q_sa[a] += p * (r + gamma * v[s_])
        policy[s] = np.argmax(q_sa)
    return policy


def value_iteration(env, gamma, epsilon, max_iterations):
    """Run synchronous value iteration on a tabular environment.

    Each sweep recomputes every state's value from a frozen copy of the
    previous sweep's values, taking the maximum over actions of the expected
    one-step return read from ``env.P[s][a]``.

    Args:
        env: Environment exposing ``nS``, ``nA`` and the transition table ``P``.
        gamma: Discount factor applied to successor state values.
        epsilon: Stop once the summed absolute change of all state values in
            a sweep is at most this threshold.
        max_iterations: Upper bound on the number of sweeps.

    Returns:
        A tuple ``(differences, V)``: the per-sweep summed absolute change,
        and the final state-value array.
    """
    V = np.zeros(env.nS)
    differences = []
    for _ in range(max_iterations):
        previous = np.copy(V)
        for state in range(env.nS):
            action_values = []
            for action in range(env.nA):
                expected = 0
                for prob, nxt, reward, _done in env.P[state][action]:
                    expected += prob * (reward + (gamma * previous[nxt]))
                action_values.append(expected)
            V[state] = max(action_values)
        sweep_change = np.sum(np.fabs(previous - V))
        differences.append(sweep_change)
        # Converged: the whole value table moved by no more than epsilon.
        if sweep_change <= epsilon:
            break
    return differences, V

# Hyperparameter grid: every (max_iterations, gamma, epsilon) combination is
# run once and its result is written to its own JSON file.
max_iterations = [1e1, 1e2, 1e4, 1e6]
gammas = [0.1, 0.6, 0.9]
epsilons = [1e-1, 1e-3, 1e-5]
hyperparameters = list(itertools.product(max_iterations, gammas, epsilons))

# Create the output directory up front; otherwise the first open(..., 'w')
# fails with FileNotFoundError on a fresh checkout.
output_dir = 'artifacts/frozen_lake_value_iteration'
os.makedirs(output_dir, exist_ok=True)

env = gym.make('FrozenLake8x8-v1')
for args in tqdm.tqdm(hyperparameters, desc="Experiment Iterations"):
    env.reset()
    max_iteration, gamma, epsilon = args

    # perf_counter() measures elapsed (wall-clock) time, which is what the
    # '*_wall_time' result keys promise; process_time() would report CPU time.
    start_time = time.perf_counter()
    # The grid stores iteration caps as floats (1e1, ...); range() needs an int.
    differences, V = value_iteration(env, gamma, epsilon=epsilon, max_iterations=int(max_iteration))
    value_iteration_wall_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    policy = extract_policy(V, gamma)
    extract_policy_wall_time = time.perf_counter() - start_time

    average_policy_score = test_policy(env, policy, gamma)

    result = {
        'average_policy_score': average_policy_score,
        'policy': policy.tolist(),
        'V': V.tolist(),
        'differences': differences,
        'value_iteration_wall_time': value_iteration_wall_time,
        'extract_policy_wall_time': extract_policy_wall_time,
        'gamma': gamma,
        'epsilon': epsilon,
        'max_iterations': max_iteration
    }

    # One file per run, named by the current nanosecond timestamp.
    with open(f'{output_dir}/{time.time_ns()}.json', 'w') as f:
        json.dump(result, f)

Experiment Iterations: 100%|██████████| 27/27 [00:02<00:00, 11.72it/s]
