In [218]:
import numpy as np
import torch as tr
import gym
from gym.spaces import Box
import torch.nn as nn
import torch.nn.functional as F
import torch.tensor as tensor
EPS = 1e-8


class Buffer:

    def __init__(self):
        # for evaluation at end of epoch
        self.reset_epoch()
        # for storing an episode
        self.reset_episode()

    def reset_episode(self):
        self.ep_o, self.ep_a,self.ep_r,self.ep_v = [],[],[],[]
        self.ep_l = 0

    def reset_epoch(self):
        self.obs_buf, self.acts_buf,self.advs_buf,self.rtgs_buf = [] ,[],[],[]
        self.logp_prev = None


    def store_batch(self,ep_obs,ep_acts,ep_advs,ep_rtgs):
        # when episode is over, appends episode vals to batch
        self.obs_buf += ep_obs
        self.acts_buf += ep_acts
        self.advs_buf += ep_advs
        self.rtgs_buf += ep_rtgs

    def get_batch(self):

        b_a, b_o = np.array(self.acts_buf).reshape(-1), np.array(self.obs_buf)
        # important: for continuous action space reshape acts to [batch_size,1]
        b_a = b_a.reshape(-1,1)
        # normalize trick
        b_adv  = np.array((self.advs_buf - np.mean(self.advs_buf))/(np.std(self.advs_buf) + 1e-8))
        b_rtg = np.array(self.rtgs_buf)

        return [b_o,b_a,b_adv,b_rtg]

    def __len__(self):
        return len(self.obs_buf)

    def store_episode(self,o,a,r,v):
        self.ep_o.append(o)
        self.ep_a.append(a)
        self.ep_r.append(r)
        self.ep_v.append(v)
        self.ep_l+=1

    def get_episode(self):
        return self.ep_o,self.ep_a,self.ep_r,self.ep_v,self.ep_l

    

class Logger:
    """
    Logs relevant values and prints them
    """

    def __init__(self):
        self.reset_logger()

    def reset_logger(self):
        self.train_r, self.ep_len = [],[]

    def store(self, train_r=None, ep_len=None, train=True):
        if train:
            self.train_r.append(train_r)
            self.ep_len.append(ep_len)
        else:
            pass

    def get_vals(self):

        vals = np.round([np.mean(self.train_r),np.mean(self.ep_len)],2)
        return vals


    def print_epoch(self, epoch, loss1, loss2):
        train_r, ep_len = self.get_vals()

        print('epoch {0}  pi_loss {1:.3f}  v_loss {2:.3f}  episode length {3}  returns {4}'.format(epoch,loss1,loss2, ep_len,train_r ))
        self.reset_logger()
        
# env helpers
def env_setup(env_name):
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]

    assert isinstance(env.action_space,Box), "Sorry this VPG only works with continuous action spaces"
    act_dim = env.action_space.shape[0]


    return env,obs_dim,act_dim

def reset_env(env):
    obs, rew, done, ep_rews, ep_vals = env.reset(), 0, False, [], []
    return obs, rew, done, ep_rews, ep_vals



def network(in_dim,out_dim,hidden_dim=32,activation=nn.Tanh,out_activation=None):
    layers = [nn.Linear(in_dim, hidden_dim), activation(),
              nn.Linear(hidden_dim, hidden_dim), activation(),
              nn.Linear(hidden_dim, out_dim)]
    if out_activation:
        layers.append(out_activation())
    
    return nn.Sequential(*layers)

def log_likelihood(a,mu,std):
    summand = (a-mu)**2/(std+EPS)**2 + 2*torch.log(std) + torch.log(2*torch.tensor(np.pi))
    return -.5*torch.sum(summand,1)

def gaussian_policy(obs,act,obs_dim,act_dim,hidden_dim=32,a=nn.Tanh,a_out=None):
    # mu is a function
    # pi is a function
    # noise is a function
    
    mu_net = network(obs_dim,act_dim,hidden_dim,a,a_out)
    mu = mu_net(obs)
    log_std = torch.tensor(-0.5*np.ones(act_dim,dtype=np.float32))
    std = torch.exp(log_std)
    noise = torch.normal(tensor(np.zeros(mu.shape)),tensor(np.ones(mu.shape))).float()
    pi = mu + noise * std
    logp = log_likelihood(act,mu,std)
    logpi = log_likelihood(pi,mu,std)
    return pi, logp,logpi


def actor_critic(obs,act,obs_dim,act_dim,hidden_dim=64,a=nn.Tanh,a_out=None):
    pi, logp, _ =  gaussian_policy(obs, act, obs_dim,act_dim, hidden_dim, a, a_out)
    vf = network(obs_dim,1,hidden_dim,a) # may need to squeeze
    return pi, logp, vf

def discount_cumsum(rews, gamma):
    y = gamma**np.arange(len(rews))
    gamma_mat=[np.roll(y, i, axis=0) for i in range(len(y))]
    rews_mat = np.repeat([rews], [len(rews)], axis=0)
    rews_mat = np.triu(rews_mat)*gamma_mat
    return np.sum(rews_mat,axis=1)

class Actor(nn.Module):
    def __init__(self,obs_dim,act_dim,h_dim):
        super(Actor, self).__init__()

        self.layer1 = nn.Linear(obs_dim, h_dim)
        self.layer2 = nn.Linear(h_dim, h_dim)
        self.layer3 = nn.Linear(h_dim, act_dim)  # Prob of Left

    def forward(self, x):
        x = F.tanh(self.layer1(x))
        x = F.tanh(self.layer2(x))
        x = self.layer3(x)
        return x

    def gaussian_policy(self,x):
        mu = self.forward(x)
        log_std = -0.5*torch.ones(act_dim)
        std = tr.exp(log_std)
        noise = tr.normal(torch.zeros(mu.shape),torch.ones(mu.shape))
        pi = mu + noise * std
        return pi,mu,std

    def log_prob(self,obs,acts):
        pi,mu,std = self.gaussian_policy(obs)
        return log_likelihood(acts,mu,std)
    
class Critic(nn.Module):
    def __init__(self,obs_dim,h_dim):
        super(Critic, self).__init__()

        self.layer1 = nn.Linear(obs_dim, h_dim)
        self.layer2 = nn.Linear(h_dim, h_dim)
        self.layer3 = nn.Linear(h_dim, 1)  # Prob of Left

    def forward(self, x):
        x = F.tanh(self.layer1(x))
        x = F.tanh(self.layer2(x))
        x = self.layer3(x)
        return x


def gaussian_policy(x,actor):
    mu = actor(x)
    log_std = -0.5*torch.ones(act_dim)
    std = torch.exp(log_std)
    noise = torch.normal(tensor(np.zeros(mu.shape)),tensor(np.ones(mu.shape))).float()
    pi = mu + noise * std
    return pi,mu,std

def log_prob(obs,acts,actor):
    
    pi,mu,std = gaussian_policy(obs,actor)
    print(pi,mu,std)
    return log_likelihood(acts,mu,std)
    




In [219]:
actor = Actor(obs_dim,act_dim,256)
obs = tensor(np.random.random_sample((32,4))).float()
acts = tensor(np.random.random_sample((32,1))).float()

def gaussian_policy(x):
    mu = actor(x)
    log_std = -0.5*torch.ones(act_dim)
    std = tr.exp(log_std)
    noise = tr.normal(torch.zeros(mu.shape),torch.ones(mu.shape))
    pi = mu + noise * std
    return pi,mu,std

def log_prob(obs,acts,actor):
    
    pi,mu,std = gaussian_policy(obs,actor)
    print(pi,mu,std)
    return log_likelihood(acts,mu,std)
print(actor.log_prob(obs,acts))
tr.mean(actor.log_prob(obs,acts))

tensor([-1.7224, -0.6312, -1.9547, -0.4724, -0.4911, -1.3226, -0.6250, -0.8205,
        -1.3231, -0.9611, -0.5322, -0.4250, -1.0231, -0.5980, -0.4985, -1.2943,
        -1.0169, -0.5199, -1.3583, -1.7503, -1.3446, -0.4894, -1.0663, -1.5073,
        -0.4410, -1.8719, -0.6539, -0.6794, -1.1414, -0.8046, -1.2949, -0.4379],
       grad_fn=<MulBackward0>)


tensor(-0.9710, grad_fn=<MeanBackward1>)

In [270]:
import numpy as np
import torch as tr
import gym
from gym.spaces import Box
import torch.nn as nn
import torch.nn.functional as F
import torch.tensor as tensor
EPS = 1e-8


class Buffer:

    def __init__(self):
        # for evaluation at end of epoch
        self.reset_epoch()
        # for storing an episode
        self.reset_episode()

    def reset_episode(self):
        self.ep_o, self.ep_a,self.ep_r,self.ep_v = [],[],[],[]
        self.ep_l = 0

    def reset_epoch(self):
        self.obs_buf, self.acts_buf,self.advs_buf,self.rtgs_buf = [] ,[],[],[]
        self.logp_prev = None


    def store_batch(self,ep_obs,ep_acts,ep_advs,ep_rtgs):
        # when episode is over, appends episode vals to batch
        self.obs_buf += ep_obs
        self.acts_buf += ep_acts
        self.advs_buf += ep_advs
        self.rtgs_buf += ep_rtgs

    def get_batch(self):

        b_a, b_o = np.array(self.acts_buf).reshape(-1), np.array(self.obs_buf)
        # important: for continuous action space reshape acts to [batch_size,1]
        b_a = b_a.reshape(-1,1)
        # normalize trick
        b_adv  = np.array((self.advs_buf - np.mean(self.advs_buf))/(np.std(self.advs_buf) + 1e-8))
        b_rtg = np.array(self.rtgs_buf)

        return [b_o,b_a,b_adv,b_rtg]

    def __len__(self):
        return len(self.obs_buf)

    def store_episode(self,o,a,r,v):
        self.ep_o.append(o)
        self.ep_a.append(a)
        self.ep_r.append(r)
        self.ep_v.append(v)
        self.ep_l+=1

    def get_episode(self):
        return self.ep_o,self.ep_a,self.ep_r,self.ep_v,self.ep_l

    

class Logger:
    """
    Logs relevant values and prints them
    """

    def __init__(self):
        self.reset_logger()

    def reset_logger(self):
        self.train_r, self.ep_len = [],[]

    def store(self, train_r=None, ep_len=None, train=True):
        if train:
            self.train_r.append(train_r)
            self.ep_len.append(ep_len)
        else:
            pass

    def get_vals(self):

        vals = np.round([np.mean(self.train_r),np.mean(self.ep_len)],2)
        return vals

    def print_epoch(self, epoch, loss1, loss2):
        train_r, ep_len = self.get_vals()

        print('epoch {0}  pi_loss {1:.3f}  v_loss {2:.3f}  episode length {3}  returns {4}'.format(epoch,loss1,loss2, ep_len,train_r ))
        self.reset_logger()
        
# env helpers
def env_setup(env_name):
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]

    assert isinstance(env.action_space,Box), "Sorry this VPG only works with continuous action spaces"
    act_dim = env.action_space.shape[0]


    return env,obs_dim,act_dim

def reset_env(env):
    obs, rew, done, ep_rews, ep_vals = env.reset(), 0, False, [], []
    return obs, rew, done, ep_rews, ep_vals

def log_likelihood(a,mu,std):
    summand = (a-mu)**2/(std+EPS)**2 + 2*tr.log(std) + tr.log(2*tensor(np.pi))
    return -.5*tr.sum(summand,1)

def discount_cumsum(rews, gamma):
    y = gamma**np.arange(len(rews))
    gamma_mat=[np.roll(y, i, axis=0) for i in range(len(y))]
    rews_mat = np.repeat([rews], [len(rews)], axis=0)
    rews_mat = np.triu(rews_mat)*gamma_mat
    return np.sum(rews_mat,axis=1)

class Actor(nn.Module):
    def __init__(self,obs_dim,act_dim,h_dim):
        super(Actor, self).__init__()

        self.layer1 = nn.Linear(obs_dim, h_dim)
        self.layer2 = nn.Linear(h_dim, h_dim)
        self.layer3 = nn.Linear(h_dim, act_dim)  # Prob of Left

    def forward(self, x):
        x = F.tanh(self.layer1(x))
        x = F.tanh(self.layer2(x))
        x = self.layer3(x)
        return x

    def gaussian_policy(self,x):
        mu = self.forward(x)
        log_std = -0.5*torch.ones(act_dim)
        std = tr.exp(log_std)
        noise = tr.normal(torch.zeros(mu.shape),torch.ones(mu.shape))
        pi = mu + noise * std
        return pi,mu,std

    def log_prob(self,obs,acts):
        pi,mu,std = self.gaussian_policy(obs)
        return log_likelihood(acts,mu,std)
    
class Critic(nn.Module):
    def __init__(self,obs_dim,h_dim):
        super(Critic, self).__init__()

        self.layer1 = nn.Linear(obs_dim, h_dim)
        self.layer2 = nn.Linear(h_dim, h_dim)
        self.layer3 = nn.Linear(h_dim, 1)  # Prob of Left

    def forward(self, x):
        x = F.tanh(self.layer1(x))
        x = F.tanh(self.layer2(x))
        x = F.relu(self.layer3(x))
        return x



# main train loop
def train(env_name='InvertedPendulum-v2',
          pi_lr=1e-2,vf_lr=1e-3, gamma=0.99, lam=.95, n_iters=50, batch_size=5000
          ):

    env,obs_dim,act_dim = env_setup(env_name)
    #adv_ph,ret_ph = placeholders(2) # replace these
    pi = Actor(obs_dim,act_dim,256)
    vf = Critic(obs_dim,64)
 
    loss_pi = lambda obs,acts,adv: -torch.mean(log_probs(obs,acts,pi)*adv)
    optimizer_pi = torch.optim.Adam(pi.parameters(),lr=pi_lr)
    
    mse = nn.MSELoss(reduction='sum')
    loss_v = lambda rets,obs: mse(rets,vf(obs))
    optimizer_v = torch.optim.Adam(vf.parameters(),lr=vf_lr)
    
    def train_one_iteration(epoch):
        # one epoch loop
        buffer = Buffer()
        logger = Logger()

        obs, rew, done, ep_rews, ep_vals = reset_env(env)
        while True:
            # one episode loop

            act = pi.gaussian_policy(tensor(obs).float())[0].detach().numpy() # TODO SET ACTION HERE USING PI #sess.run(pi, {obs_ph: obs.reshape(1,-1)})[0]
            v_t = vf(tensor(obs).float())[0].detach().numpy() # TODO GET VALUE FUNC HERE USING ACTOR_CRITIC # sess.run(vf, {obs_ph: obs.reshape(1,-1)})[0]
            
         
            obs2, rew, done, _ = env.step(act)
            
            
            buffer.store_episode(obs,act,rew,v_t)

            obs=obs2

            if done:
                # add episode to batch

                # get episode from buffer
                ep_obs, ep_acts, ep_rews, ep_vals, ep_len = buffer.get_episode() #len(ep_rews)

                # run GAE to get advs
                # outputs estimate for adv
                # if agent died, last_value=reward
                alive = ep_len == env._max_episode_steps
                last_val = vf(tensor(obs).float()).detach().numpy()[0] if alive else rew

                # start advantage compute
                # add last value to compute TD \gamma * V_{t+1} - V_{t}
                ep_rews.append(last_val)
                ep_vals.append(last_val)

                # compute deltas for GAE
                deltas=np.array(ep_rews[:-1]) + gamma * np.array(ep_vals[1:]) - np.array(ep_vals[:-1])

                # go back to how it was
                ep_rews = ep_rews[:-1]
                ep_vals = ep_vals[:-1]

                ep_advs = list(discount_cumsum(deltas, gamma*lam))
                ep_rtgs = list(discount_cumsum(ep_rews, gamma))

                buffer.store_batch(ep_obs, ep_acts,ep_advs,ep_rtgs)
                logger.store(sum(ep_rews),len(ep_rews))

                # reset episode

                buffer.reset_episode()

                obs, rew, done, ep_rews, ep_vals = reset_env(env)
                if len(buffer) > batch_size:
                    break
        
       
        b_o, b_a, b_adv, b_rtg = buffer.get_batch()
        b_o, b_a, b_adv, b_rtg = tensor(b_o).float(),tensor(b_a).float(),tensor(b_adv).float(),tensor(b_rtg).float()
        
        
    
        optimizer_pi.zero_grad()
        # get log-likelihoods of state-action pairs
        logp = pi.log_prob(b_o,b_a)
        # choose loss to maximize likelihood*advantage
        loss_pi = -tr.mean(logp*b_adv)
        loss_pi.backward()
        optimizer_pi.step()
        
        optimizer_v.zero_grad()
        
        
        loss_v = tr.mean((b_rtg-vf(b_o).flatten())**2)
        loss_v.backward()
        optimizer_v.step()
        logger.print_epoch(epoch,loss_pi.detach().numpy(),np.sqrt(loss_v.detach().numpy()))

    for epoch in range(n_iters):
        train_one_iteration(epoch)
        
if __name__ == '__main__':
    train()

epoch 0  pi_loss -0.036  v_loss 7.576  episode length 9.52  returns 9.52
epoch 1  pi_loss 0.023  v_loss 5.311  episode length 7.72  returns 7.72
epoch 2  pi_loss -0.009  v_loss 10.905  episode length 12.84  returns 12.84
epoch 3  pi_loss 0.013  v_loss 9.910  episode length 13.2  returns 13.2
epoch 4  pi_loss 0.008  v_loss 16.281  episode length 23.32  returns 23.32
epoch 5  pi_loss -0.015  v_loss 18.351  episode length 27.58  returns 27.58
epoch 6  pi_loss -0.008  v_loss 8.875  episode length 11.48  returns 11.48
epoch 7  pi_loss -0.053  v_loss 28.667  episode length 48.17  returns 48.17
epoch 8  pi_loss -0.025  v_loss 24.878  episode length 37.52  returns 37.52
epoch 9  pi_loss 0.018  v_loss 25.836  episode length 40.85  returns 40.85
epoch 10  pi_loss 0.012  v_loss 21.886  episode length 32.43  returns 32.43
epoch 11  pi_loss 0.004  v_loss 19.498  episode length 27.14  returns 27.14
epoch 12  pi_loss 0.006  v_loss 22.321  episode length 35.05  returns 35.05
epoch 13  pi_loss -0.011  

In [262]:
torch.ones(1)

tensor([1.])

In [124]:

obs_dim = 4
act_dim =1 
obs = torch.randn(32, 4)
a = torch.randn(32, 1)
mu = network(obs_dim,act_dim,256)
#mu(obs)
#gaussian_policy(obs,a,obs_dim,act_dim,hidden_dim=32,a=nn.Tanh,a_out=None)
x,y,z = actor_critic(obs,a,obs_dim,act_dim,64,a=nn.Tanh,a_out=None)


In [82]:
network(obs_dim,act_dim)

mu = network(obs_dim,act_dim)
log_std = tensor(-0.5*np.ones(act_dim,dtype=np.float32))
std = torch.exp(log_std)
#pi = mu(obs) + torch.normal(tensor(np.zeros(mu.shape)),tensor(np.ones(mu,shape))) * std

print(mu(obs).shape)

AttributeError: 'numpy.ndarray' object has no attribute 'dim'

In [74]:
env,obs_dim,act_dim = env_setup('InvertedPendulum-v2')
obs,_,_,_,_  = reset_env(env)

gaussian_policy(tensor(obs),tensor([.1]),obs_dim,act_dim)

RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #2 'mat2'

In [58]:
x = tensor(np.ones((3,2)))
x.shape

torch.Size([3, 2])

(array([0.00204692, 0.00725553, 0.00259891, 0.00921754]), 0, False, [], [])

In [49]:
import gym

env = gym.make('CartPole-v0')
env.action_space.shape
env.observation_space.shape


(4,)

In [5]:
import torch as tr
# some labels
labels = torch.arange(3)
labels = labels.reshape(3, 1)

num_classes = 4
one_hot_target = (labels == torch.arange(num_classes).reshape(1, num_classes)).float()

In [6]:
one_hot_target

tensor([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.]])

In [37]:
import torch as tr
a = tr.tensor([0,1,1,0,1,0])
a = a.reshape(-1,1)
act_dim = 2

one_hot = (a == tr.arange(act_dim)).float()


tensor([[1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.]])

In [28]:
tr.arange(act_dim).float()

tensor([0., 1.])

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Softmax is also in torch.nn.functional
data = torch.randn(5)
print(data)
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # theres also log_softmax

tensor([0.6145, 0.2061, 1.0119, 0.4910, 1.2089])
tensor([0.1710, 0.1137, 0.2544, 0.1511, 0.3098])
tensor(1.)
tensor([-1.7662, -2.1745, -1.3688, -1.8897, -1.1718])


In [48]:
from torch.distributions import Categorical

m = Categorical(torch.tensor([[ 9, 0.25, 0.25, 0.25 ],[ 9, 0.25, 0.25, 0.25 ]]))
m.sample()

tensor([0, 0])

In [50]:
from itertools import count

for t in count():
    print(t)

    if t==10:break

0
1
2
3
4
5
6
7
8
9
10


In [56]:
from torch.autograd import Variable
from torch import autograd
x = Variable(torch.Tensor([0.1, 0.1]), requires_grad=True)
f = 3 * x[0] ** 2 + 4 * x[0] * x[1] + x[1] **2
grad_f, = autograd.grad(f, x, create_graph=True)
z = grad_f @ v
z.backward()
x.grad

tensor([10.,  6.])

In [62]:
import torch
from torch.autograd import grad

def nth_derivative(f, wrt, n):

    for i in range(n):

        grads = grad(f, wrt, create_graph=True)[0]
        f = grads.sum()

    return grads

x = torch.arange(4, requires_grad=True,dtype=torch.float).reshape(2, 2)
loss = (x ** 4).sum()

print(nth_derivative(f=loss, wrt=x, n=3))

tensor([[ 0., 24.],
        [48., 72.]], grad_fn=<MulBackward0>)


In [65]:
import numpy as np
import tensorflow as tf
import gym
from gym.spaces import Box
tf.reset_default_graph()

EPS = 1e-8

# basic utils
def placeholders(n):
    assert (n>1),"must provide more than 1 placeholder"
    return [tf.placeholder(shape=(None,), dtype=tf.float32) for i in range(n)]

# env helpers
def env_setup(env_name):
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)

    assert isinstance(env.action_space,Box), "Sorry this PPO only works with continuous action spaces"
    act_dim = env.action_space.shape[0]
    act_ph = tf.placeholder(shape=(None,1), dtype=tf.float32)


    return env,obs_dim,act_dim, obs_ph, act_ph

def reset_env(env):
    obs, rew, done, ep_rews, ep_vals = env.reset(), 0, False, [], []
    return obs, rew, done, ep_rews, ep_vals

# actor-critic construction
def network(x,hidden_units=[32,32,1],a=tf.tanh,a_out=None):
    for u in hidden_units[:-1]:
        x = tf.layers.dense(x,units=u,activation=a)
    return tf.layers.dense(x,units=hidden_units[-1],activation=a_out)

def log_likelihood(a,mu,std):
    summand = (a-mu)**2/(std+EPS)**2 + 2*tf.log(std) + tf.log(2*np.pi)
    return -.5*tf.reduce_sum(summand,axis=1)

def gaussian_policy(obs,act,act_dim,hidden_units=[32,32],a=tf.tanh,a_out=None):
    mu = network(obs,hidden_units+[act_dim],a,a_out)
    log_std = tf.get_variable(name='log_std',initializer=-0.5*np.ones(act_dim,dtype=np.float32))
    std = tf.exp(log_std)
    pi = mu + tf.random_normal(tf.shape(mu)) * std
    logp = log_likelihood(act,mu,std)
    logpi = log_likelihood(pi,mu,std)
    return pi, logp,logpi

def value_approx(obs,hidden_units=[64,64],a=tf.tanh):
    return tf.squeeze(network(obs,hidden_units+[1],a),axis=1)

def actor_critic(x,act,act_dim,hidden_units=[64,64],a=tf.tanh,a_out=None):
    pi, logp, _ =  gaussian_policy(x, act, act_dim, hidden_units, a, a_out)
    vf = value_approx(x,hidden_units=[64,64],a=tf.tanh)
    return pi, logp, vf

# reward helper
def discount_cumsum(rews, gamma):
    y = gamma**np.arange(len(rews))
    gamma_mat=[np.roll(y, i, axis=0) for i in range(len(y))]
    rews_mat = np.repeat([rews], [len(rews)], axis=0)
    rews_mat = np.triu(rews_mat)*gamma_mat
    return np.sum(rews_mat,axis=1)

# Experience buffer and logger classes

class Buffer:
    
    def __init__(self):
        # for evaluation at end of epoch
        self.reset_epoch()
        # for storing an episode
        self.reset_episode()
        
    def reset_episode(self):
        self.ep_o, self.ep_a,self.ep_r,self.ep_v = [],[],[],[]
        self.ep_l = 0
    
    def reset_epoch(self):
        self.obs_buf, self.acts_buf,self.advs_buf,self.rtgs_buf = [] ,[],[],[]
        self.logp_prev = None
        
        
    def store_batch(self,ep_obs,ep_acts,ep_advs,ep_rtgs):
        # when episode is over, appends episode vals to batch
        self.obs_buf += ep_obs
        self.acts_buf += ep_acts
        self.advs_buf += ep_advs
        self.rtgs_buf += ep_rtgs
    
    def get_batch(self):
        
        b_a, b_o = np.array(self.acts_buf).reshape(-1), np.array(self.obs_buf)
        # important: for continuous action space reshape acts to [batch_size,1] 
        b_a = b_a.reshape(-1,1)
        # normalize trick
        b_adv  = np.array((self.advs_buf - np.mean(self.advs_buf))/(np.std(self.advs_buf) + 1e-8))
        b_rtg = np.array(self.rtgs_buf)
        
        return [b_o,b_a,b_adv,b_rtg]
    
    def get_buffer_size(self):
        return len(self.obs_buf)
    
    def store_episode(self,o,a,r,v):
        self.ep_o.append(o)
        self.ep_a.append(a)
        self.ep_r.append(r)
        self.ep_v.append(v)
        self.ep_l+=1
    
    def get_episode(self):
        return self.ep_o,self.ep_a,self.ep_r,self.ep_v,self.ep_l
        
class Logger:
    """
    Logs relevant values and prints them 
    """
    
    def __init__(self):
        self.reset_logger()

    def reset_logger(self):
        self.train_r, self.ep_len = [],[]
        
    def store(self, train_r=None, ep_len=None, train=True):
        if train:
            self.train_r.append(train_r)
            self.ep_len.append(ep_len)
        else:
            pass
                
    def get_vals(self):

        vals = np.round([np.mean(self.train_r),np.mean(self.ep_len)],2)
        return vals
    

    def print_epoch(self, epoch, loss1, loss2):
        train_r, ep_len = self.get_vals()
        
        print('epoch {0}  pi_loss {1:.3f}  v_loss {2:.3f}  episode length {3}  returns {4}'.format(epoch,loss1,loss2, ep_len,train_r ))
        self.reset_logger()

# main train loop
def train(env_name='InvertedPendulum-v2',
          lr=3e-2,vf_lr=1e-1, gamma=0.99, lam=.95, n_iters=50, batch_size=5000
          ):
    
    env,obs_dim,act_dim, obs_ph, act_ph = env_setup(env_name)
    adv_ph,ret_ph = placeholders(2)
    
    pi, logp, vf = actor_critic(obs_ph, act_ph,act_dim=act_dim)
    
    loss_pi, loss_v = -tf.reduce_mean(logp*adv_ph), tf.reduce_mean((ret_ph-vf)**2)
    train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss_pi)
    train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(loss_v)

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    
    def train_one_iteration(epoch):
        # one epoch loop 
        buffer = Buffer()
        logger = Logger()
        
        obs, rew, done, ep_rews, ep_vals = reset_env(env)
        while True:
            # one episode loop
        
            act = sess.run(pi, {obs_ph: obs.reshape(1,-1)})[0]
            v_t = sess.run(vf, {obs_ph: obs.reshape(1,-1)})[0]
            obs2, rew, done, _ = env.step(act)
            
            buffer.store_episode(obs,act,rew,v_t)
            
            obs=obs2
            
            if done:
                # add episode to batch 
                
                # get episode from buffer
                ep_obs, ep_acts, ep_rews, ep_vals, ep_len = buffer.get_episode() #len(ep_rews)
                
                # run GAE to get advs
                # outputs estimate for adv
                # if agent died, last_value=reward
                alive = ep_len == env._max_episode_steps
                last_val = sess.run(vf, {obs_ph: obs.reshape(1,-1)})[0] if alive else rew 

                # start advantage compute
                # add last value to compute TD \gamma * V_{t+1} - V_{t}
                ep_rews.append(last_val)
                ep_vals.append(last_val)

                # compute deltas for GAE
                deltas=np.array(ep_rews[:-1]) + gamma * np.array(ep_vals[1:]) - np.array(ep_vals[:-1])  

                # go back to how it was
                ep_rews = ep_rews[:-1]
                ep_vals = ep_vals[:-1]

                ep_advs = list(discount_cumsum(deltas, gamma*lam))
                ep_rtgs = list(discount_cumsum(ep_rews, gamma))

                buffer.store_batch(ep_obs, ep_acts,ep_advs,ep_rtgs)
                logger.store(sum(ep_rews),len(ep_rews))
                
                # reset episode

                buffer.reset_episode()
                
                obs, rew, done, ep_rews, ep_vals = reset_env(env)
                if (buffer.get_buffer_size() > batch_size):
                    break
        
        b_o, b_a, b_adv, b_rtg = buffer.get_batch()

        inputs = {obs_ph: b_o, act_ph: b_a, adv_ph: b_adv, ret_ph: b_rtg}     
        
        sess.run([train_pi,train_v], feed_dict=inputs)        
        b_loss_pi, b_loss_v =  sess.run([loss_pi,loss_v], feed_dict=inputs)
        
        logger.print_epoch(epoch,b_loss_pi,b_loss_v)

    for epoch in range(n_iters):
        train_one_iteration(epoch)
        
        

if __name__ == '__main__':
    train()


epoch 0  pi_loss -2.026  v_loss 263.924  episode length 23.21  returns 23.21
epoch 1  pi_loss -1.091  v_loss 84.591  episode length 14.19  returns 14.19
epoch 2  pi_loss -0.710  v_loss 345.974  episode length 35.11  returns 35.11
epoch 3  pi_loss -0.002  v_loss 585.976  episode length 60.67  returns 60.67
epoch 4  pi_loss -0.713  v_loss 489.698  episode length 51.57  returns 51.57
epoch 5  pi_loss 0.040  v_loss 884.475  episode length 108.0  returns 108.0
epoch 6  pi_loss -0.040  v_loss 582.901  episode length 99.84  returns 99.84
epoch 7  pi_loss -0.017  v_loss 1315.972  episode length 170.07  returns 170.07
epoch 8  pi_loss -0.099  v_loss 405.469  episode length 106.15  returns 106.15
epoch 9  pi_loss -0.313  v_loss 529.724  episode length 135.21  returns 135.21
epoch 10  pi_loss -0.007  v_loss 2511.908  episode length 1000.0  returns 1000.0
epoch 11  pi_loss -0.032  v_loss 1644.718  episode length 551.6  returns 551.6
epoch 12  pi_loss -0.061  v_loss 882.640  episode length 284.61  

In [67]:
x = torch.tensor([1,2,3,4])
y = torch.tensor([2,2,1,5])
torch.min(x,y)

tensor([1, 2, 1, 4])

In [12]:
import gym

env = gym.make('CartPole-v0')

state = env.reset()
print(state)
min_pos = -2.0
max_pos = 2.0
symmetric_states = []


[ 0.03665843  0.03732891  0.01799848 -0.03420282]


In [14]:
np.linspace(min_pos,max_pos,10)

array([-2.3       , -1.78888889, -1.27777778, -0.76666667, -0.25555556,
        0.25555556,  0.76666667,  1.27777778,  1.78888889,  2.3       ])