In [22]:
import json
import time
import gym
import tqdm
import itertools
from pylab import *

In [21]:
def low_pass(data, alpha=0.99):
    low_pass = [data[0]]
    for i in range(1,len(data)):
        low_pass.append(alpha*low_pass[-1] + (1.0-alpha)*data[i] )
    return low_pass


def q_learn(max_iteration, max_steps, learning_rate, gamma):
    Q = np.zeros([env.observation_space.n,env.action_space.n])
    reward_list = []
    for i in range(max_iteration):
        state = env.reset()
        reward_sum = 0.0
        for _ in range(max_steps):
            action_list = Q[state,:]
            fuzzy_actions = action_list + np.random.randn(1,env.action_space.n)*(1. / (i + 1))
            selected_action = np.argmax(fuzzy_actions)
            new_state, reward, done, _ = env.step(selected_action)
            Q[state,selected_action] = Q[state,selected_action] + learning_rate*(reward + gamma * np.max(Q[new_state,:]) - Q[state,selected_action])
            state = new_state
            reward_sum += reward
            if done:
                break
        reward_list.append(reward_sum)
    return reward_list, Q

env = gym.make('FrozenLake8x8-v1')

max_steps = [1e2, 1e3, 1e4]
max_iters = [1e2, 1e4, 1e6]
gammas = [0.1, 0.6, 0.9]
learning_rates = [0.1, 0.5, 0.9]
hyperparameters = list(itertools.product(max_steps, max_iters, gammas, learning_rates))

for args in tqdm.tqdm(hyperparameters, desc="Experiment Iterations"):
    env.reset()
    max_steps, max_iteration, gamma, learning_rate = args
    start_time = time.process_time()
    reward_list, Q = q_learn(int(max_steps), int(max_iteration), learning_rate, gamma)
    wall_time = time.process_time() - start_time
    results = {
        'max_iteration': max_iteration,
        'max_steps': max_steps,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'wall_time': wall_time,
        'training_loss': reward_list,
        'Q': Q.tolist()
    }
    with open(f'artifacts/frozen_lake_q_learning/{time.time_ns()}.json', 'w') as f:
        json.dump(results, f)

Experiment Iterations:  35%|███▍      | 28/81 [00:02<00:05,  9.71it/s]


KeyboardInterrupt: 