In [1]:
import gym
from scipy.special import expit as sigmoid
import numpy as np
import numpy.random as npr
import time
from itertools import count
from collections import deque
import matplotlib.pyplot as plt
from a2c_ppo_acktr import utils
import random

from heap import Heap

In [2]:
class args(object):
    bandwidth = 3
    postwidth = 3
    threshold = 0.8
    slip_reward = -0.1
    signal_split = 1
    fps = 25
    log_interval = 10
    use_gae = False
    num_updates = 1e5
    num_steps = 32
    memory_capacity = num_steps*fps + 1
    clip_param = 0.2
    ppo_epoch = 4
    num_mini_batch = 32
    value_loss_coef = 0.5
    entropy_coef = 0.01
    lr = 1e-2
    eps = 1e-5
    max_grad_norm = 0.2
    gamma = 0.99
    gae_lambda = 0.95
    use_proper_time_limits = False

In [3]:
env = gym.make('CartPole-v0')
heap = Heap(13, 4, 1, args=args)



In [4]:
episode_rewards = deque(maxlen=10)
episode_randoms = deque(maxlen=10)
done = True

for j in count(1):
    heap.clear_memory()
    if done:
        state = env.reset()
        episode_rewards.append(0)
        episode_randoms.append(0)
    for unit in heap.units:
        utils.update_linear_schedule(unit.agent.optimizer, j, args.num_updates, args.lr)
    for step in range(args.num_steps):
        action, zeros_mask = heap(state, n=args.fps)
        action = int(action > 0.5)
        if len(zeros_mask[0]):
            action = random.randint(0, 1)
        state, reward, done, info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            heap.done()
        elif len(zeros_mask[0]):
            episode_randoms[-1] += 1
    heap.update()
    print('Iter: %d, Avg/Max/Min. reward: %0.1f/%0.1f/%0.1f, Avg randomness: %0.2f' % (j, sum(episode_rewards)/len(episode_rewards), max(episode_rewards), min(episode_rewards), sum(episode_randoms)/sum(episode_rewards)))



Iter: 1, Avg/Max/Min. reward: 12.0/12.0/12.0, Avg randomness: 0.92
Iter: 2, Avg/Max/Min. reward: 18.5/25.0/12.0, Avg randomness: 0.95
Iter: 3, Avg/Max/Min. reward: 18.3/25.0/12.0, Avg randomness: 0.95
Iter: 4, Avg/Max/Min. reward: 20.0/25.0/12.0, Avg randomness: 0.95
Iter: 5, Avg/Max/Min. reward: 21.2/26.0/12.0, Avg randomness: 0.95
Iter: 6, Avg/Max/Min. reward: 19.3/26.0/10.0, Avg randomness: 0.95
Iter: 7, Avg/Max/Min. reward: 19.6/26.0/10.0, Avg randomness: 0.95
Iter: 8, Avg/Max/Min. reward: 21.1/32.0/10.0, Avg randomness: 0.96
Iter: 9, Avg/Max/Min. reward: 23.8/53.0/10.0, Avg randomness: 0.96
Iter: 10, Avg/Max/Min. reward: 23.4/53.0/10.0, Avg randomness: 0.96


KeyboardInterrupt: 

In [None]:
heap.plot_stats(n=100)