In [1]:
import gym
from scipy.special import expit as sigmoid
import numpy as np
import numpy.random as npr
import time
from itertools import count
from collections import deque
import matplotlib.pyplot as plt
from a2c_ppo_acktr import utils
import random

from heap import Heap

In [2]:
class args(object):
    """Static hyperparameter namespace; the class itself is passed to ``Heap(..., args=args)``.

    Attributes are grouped by where this notebook visibly uses them. Anything
    not read by a notebook cell is presumably consumed inside ``heap.Heap`` /
    ``a2c_ppo_acktr`` -- confirm against those modules before renaming.
    """

    # --- read directly by the training cell ---
    fps = 3                 # passed as ``n`` on every ``heap(state, n=...)`` call
    num_steps = 32          # environment steps collected per update
    num_updates = 1e5       # horizon handed to ``utils.update_linear_schedule``
    lr = 1e-3               # initial learning rate handed to the same schedule
    log_interval = 250      # logging period and length of the episode-stat deques
    sleep_freq = 0          # 0 disables the ``heap.sleep`` call in the rollout loop
    sleep_skip = False      # forwarded as ``heap.sleep(skip=...)``

    # --- derived ---
    memory_capacity = 20 * num_steps * fps

    # --- not referenced in the notebook cells (presumably Heap-specific) ---
    bandwidth = 3
    postwidth = 3
    threshold = 0.1
    slip_reward = -0.001
    signal_split = 1

    # --- not referenced in the notebook cells (names match a2c_ppo_acktr options) ---
    use_gae = False
    gamma = 0.99
    gae_lambda = 0.95
    use_proper_time_limits = False
    ppo_epoch = 1
    num_mini_batch = 32
    clip_param = 0.2
    value_loss_coef = 0.5
    entropy_coef = 0.01
    eps = 1e-5
    max_grad_norm = 0.2

In [3]:
# Environment driven by the training cell, which uses the classic gym API
# (``reset()`` returns the state, ``step()`` returns a 4-tuple).
env = gym.make('CartPole-v0')
# NOTE(review): the positional arguments are presumably
# (num_units, num_inputs, num_outputs) -- the plotting cell reads attributes with
# those names from ``heap`` -- confirm against ``heap.Heap.__init__``.
heap = Heap(13, 4, 1, args=args)



In [4]:
# Rolling windows over the most recent `log_interval` episodes: the summed reward
# of each episode, and the number of rewarded steps on which the fallback action
# (see the rollout loop) replaced the heap's own action.
episode_rewards = deque(maxlen=args.log_interval)
episode_relaxes = deque(maxlen=args.log_interval)
done = True  # forces an env/heap reset on the first iteration

# Resume numbering after the updates already recorded on the first unit, so that
# re-running this cell continues the learning-rate schedule instead of restarting it.
first_update = len(heap.units[0].action_losses) + 1
# Stop at the schedule horizon. Previously the loop ran on an unbounded counter and
# could only be ended by interrupting the kernel; `args.num_updates` is the horizon
# passed to update_linear_schedule, and in the upstream a2c_ppo_acktr implementation
# the learning rate it sets drops below zero once j exceeds that horizon
# (confirm for the local copy of a2c_ppo_acktr).
last_update = int(args.num_updates)

for j in range(first_update, last_update):
    heap.clear_memory()
    if done:
        # Start a new episode and open fresh slots for its statistics.
        state = env.reset()
        episode_rewards.append(0)
        episode_relaxes.append(0)
        heap.reset()
    for unit in heap.units:
        utils.update_linear_schedule(unit.agent.optimizer, j, args.num_updates, args.lr)

    # Collect a fixed-length rollout of `num_steps` environment steps.
    # NOTE(review): the environment is only reset at the top of the outer loop, so
    # once `done` is set the remaining steps of this rollout keep stepping a
    # finished episode (and keep calling heap.done()). Presumably intentional to
    # keep rollouts fixed-length -- confirm.
    for step in range(args.num_steps):
        if args.sleep_freq and episode_rewards[-1] % args.sleep_freq == 0 and episode_rewards[-1]:
            heap.sleep(skip=args.sleep_skip)
        action, zeros_mask = heap(state, n=args.fps)
        action = int(action > 0.5)
        relaxed = bool(len(zeros_mask[0]))
        if relaxed:
            # Fallback action chosen from the sign of state[2] when zeros_mask[0]
            # is non-empty.
            # NOTE(review): per the gym CartPole docs state[2] is the pole angle and
            # action 0 pushes the cart left, so this pushes away from the lean --
            # confirm this direction is intended.
            action = 0 if state[2] > 0 else 1
        state, reward, done, info = env.step(action)
        heap.reward(reward)
        episode_rewards[-1] += reward
        if done:
            heap.done()
        # Only rewarded steps count towards the relaxation statistic.
        if relaxed and reward:
            episode_relaxes[-1] += 1
    heap.update()

    if j % args.log_interval == 0:
        total_reward = sum(episode_rewards)
        # Guard the ratio so a window with zero accumulated reward cannot raise
        # ZeroDivisionError in the middle of training.
        relax_ratio = sum(episode_relaxes) / total_reward if total_reward else 0.0
        print('Iter: %d, Avg/Max/Min. reward: %0.1f/%0.1f/%0.1f, Avg relaxation: %0.2f' % (
            j,
            total_reward / len(episode_rewards),
            max(episode_rewards),
            min(episode_rewards),
            relax_ratio,
        ))



Iter: 250, Avg/Max/Min. reward: 14.7/55.0/8.0, Avg relaxation: 0.23
Iter: 500, Avg/Max/Min. reward: 9.8/21.0/8.0, Avg relaxation: 0.78
Iter: 750, Avg/Max/Min. reward: 8.7/11.0/8.0, Avg relaxation: 1.00
Iter: 1000, Avg/Max/Min. reward: 8.9/14.0/8.0, Avg relaxation: 0.96
Iter: 1250, Avg/Max/Min. reward: 10.5/64.0/8.0, Avg relaxation: 0.74
Iter: 1500, Avg/Max/Min. reward: 11.9/39.0/8.0, Avg relaxation: 0.57
Iter: 1750, Avg/Max/Min. reward: 9.4/22.0/8.0, Avg relaxation: 0.86
Iter: 2000, Avg/Max/Min. reward: 9.4/16.0/8.0, Avg relaxation: 0.88
Iter: 2250, Avg/Max/Min. reward: 9.4/21.0/8.0, Avg relaxation: 0.88
Iter: 2500, Avg/Max/Min. reward: 9.2/14.0/8.0, Avg relaxation: 0.90
Iter: 2750, Avg/Max/Min. reward: 9.1/12.0/8.0, Avg relaxation: 0.92
Iter: 3000, Avg/Max/Min. reward: 9.0/13.0/8.0, Avg relaxation: 0.94
Iter: 3250, Avg/Max/Min. reward: 9.0/14.0/8.0, Avg relaxation: 0.96
Iter: 3500, Avg/Max/Min. reward: 8.9/15.0/8.0, Avg relaxation: 0.97
Iter: 3750, Avg/Max/Min. reward: 9.0/13.0/8.0, A

KeyboardInterrupt: 

In [None]:
n = 1000
def moving_average(a, k=n):
    """Return the simple moving average of ``a`` over a sliding window of ``k`` samples.

    The result has ``len(a) - k + 1`` entries (one per full window) and is empty
    when ``a`` holds fewer than ``k`` samples. The default window is the value
    of the module-level ``n`` at the time this function is defined.

    Args:
        a: one-dimensional sequence of numbers.
        k: window length, a positive integer.

    Returns:
        A float ``numpy.ndarray`` of window means.

    Raises:
        ValueError: if ``k`` is smaller than 1. Without this check a negative
            ``k`` silently produced meaningless values and ``k == 0`` failed
            with an unrelated broadcasting error.
    """
    if k < 1:
        raise ValueError('moving_average window k must be >= 1, got %r' % (k,))
    ret = np.cumsum(a, dtype=float)
    # Turn running totals into per-window totals: total[i] - total[i - k].
    ret[k:] = ret[k:] - ret[:-k]
    # Entries before index k - 1 do not cover a full window; drop them.
    return ret[k - 1:] / k
# Plot the smoothed weight trace of every unit, one figure per unit group.
# The smoothing window is ten times the `n` later passed to heap.plot_stats.
smoothing_window = 10 * n
hidden_count = heap.num_units - heap.num_inputs - heap.num_outputs

# (heading, indices into each heap.weights entry). Output units are addressed
# from the end of the entry, last unit first.
unit_groups = (
    ('INPUTS', range(heap.num_inputs)),
    ('HIDDEN', range(heap.num_inputs, heap.num_inputs + hidden_count)),
    ('OUTPUTS', range(heap.num_units - 1, heap.num_units - 1 - heap.num_outputs, -1)),
)

for heading, unit_columns in unit_groups:
    print(heading)
    for rank, column in enumerate(unit_columns, start=1):
        trace = [snapshot[column] for snapshot in heap.weights]
        # Trim the x axis to the number of full smoothing windows.
        x_values = list(range(len(heap.weights)))[:1 - smoothing_window]
        plt.plot(x_values, moving_average(trace, smoothing_window), label='Unit %d' % rank)
    plt.title('Weight')
    plt.legend()
    plt.show()

heap.plot_stats(n=n)