In [29]:
import itertools
import json
import math
import sys
import time

import numpy as np
import gym
import tqdm
from matplotlib.pyplot import subplot, plot, legend, show

env = gym.make('Pendulum-v1', g=9.81)

In [28]:
coords = np.array([(np.cos(np.deg2rad(theta)), np.sin(np.deg2rad(theta))) for theta in range(0, 360, 10)])
x = coords[:, 0]
y = coords[:, 1]

# create list of velocity from -8 to 8 in steps of 0.5
velocities = np.arange(-8, 8, 1)

# combine all possible combinations of coords and velocity
possible_states = []
for i, j in itertools.product(coords, velocities):
    x = i[0]
    y = i[1]
    possible_states.append((x, y, j))

possible_states = np.array(possible_states) # 576 possible states
possible_actions = np.arange(-2.0, 2.0, 0.1)

In [47]:
def descretize_state(state, possible_states):
    index = np.argmin(np.linalg.norm(possible_states - state, axis=1))
    return index, possible_states[index]

def low_pass(data, alpha=0.99):
    low_pass = [data[0]]
    for i in range(1,len(data)):
        low_pass.append(alpha*low_pass[-1] + (1.0-alpha)*data[i] )
    return low_pass

def q_learn(max_iteration, max_steps, learning_rate, gamma, possible_states, possible_actions):
    Q = np.zeros([len(possible_states),len(possible_actions)])
    training_loss = []
    for i in range(max_iteration):
        state = env.reset()
        s_idx, state = descretize_state(state, possible_states)
        reward_sum = 0.0

        for _ in range(max_steps):
            action_list = Q[s_idx,:]
            fuzzy_actions = action_list + np.random.randn(1,len(possible_actions))*(1. / (i + 1))
            selected_action = np.argmax(fuzzy_actions)
            env.state = state
            new_state, reward, done, _ = env.step(possible_states[selected_action])
            next_sidx, next_state = descretize_state(new_state, possible_states)
            Q[s_idx,selected_action] = Q[s_idx,selected_action] + learning_rate*(reward + gamma * np.max(Q[next_sidx,:]) - Q[s_idx,selected_action])
            s_idx = next_sidx
            reward_sum += reward
            if done:
                break
            training_loss.append(reward_sum)
        return Q, training_loss

max_steps = [1e2, 1e3, 1e4]
max_iters = [1e2, 1e4, 1e6]
gammas = [0.1, 0.6, 0.9]
learning_rates = [0.1, 0.5, 0.9]
hyperparameters = list(itertools.product(max_steps, max_iters, gammas, learning_rates))

for args in tqdm.tqdm(hyperparameters, desc="Experiment Iterations"):
    max_steps, max_iteration, gamma, learning_rate = args
    start_time = time.process_time()
    Q, training_loss = q_learn(int(max_steps), int(max_iteration), learning_rate, gamma, possible_states, possible_actions)
    wall_time = time.process_time() - start_time
    results = {
        'max_iteration': max_iteration,
        'max_steps': max_steps,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'wall_time': wall_time,
        'training_loss': training_loss,
        'Q': Q.tolist()
    }
    with open(f'artifacts/pendulum_q_learning/{time.time_ns()}.json', 'w') as f:
        json.dump(results, f)

Experiment Iterations:   0%|          | 0/81 [00:00<?, ?it/s]


NameError: name 'policy' is not defined