In [1]:
from torch.distributions import Categorical
import gym
import torch.nn as nn
from py_inforce.generic.mlp import MLP
from py_inforce.policy_based.REINFORCE import REINFORCE
import torch.optim as optim
import torch
import numpy as np

# CartPole environment; its observation/action spaces give the network's
# input and output widths.
env = gym.make('CartPole-v0')
in_dim, out_dim = env.observation_space.shape[0], env.action_space.n  # 4, 2

# Policy network (two hidden layers of 128 units, ReLU) and its Adam optimizer,
# which takes the learning rate stored on the network.
cart_agent = MLP([in_dim, 128, 128, out_dim], nn.ReLU)
optimizer = optim.Adam(cart_agent.parameters(), lr=cart_agent.lr)

# Train: mean-centred returns, at most 500 episodes, early stop once the
# value handed to EARLY equals 200.
REINFORCE(
    cart_agent,
    env,
    Categorical,
    optimizer,
    200,
    bf=lambda returns: returns - returns.mean(),
    MAX_EPISODES=500,
    EARLY=lambda total: total == 200,
)

######
# Test
######

# Roll out a single episode with the policy trained above, sampling actions
# from the categorical distribution given by the network's logits, and show
# the total (undiscounted) reward collected.
done = False

state = env.reset()
rewards = 0

# Nothing is back-propagated during evaluation, so disable autograd tracking
# for the whole rollout instead of building a graph at every step.
with torch.no_grad():
    while not done:
        state = torch.from_numpy(state.astype(np.float32))
        # Call the module itself rather than .forward() so module hooks run.
        pd = Categorical(logits=cart_agent(state))
        action = pd.sample()
        state, reward, done, _ = env.step(action.numpy())
        rewards += reward
        #env.render()

rewards

187.0

In [7]:
import torch
import numpy as np

# Five-slot float buffer, initially all zeros.
d = np.zeros(5)

# Uniform categorical distribution over four outcomes (probabilities, not logits).
m = Categorical(probs=torch.full((4,), 0.25))

# Draw one outcome and store its log-probability in the first slot; the
# 0-d tensor is converted to a float on assignment into the numpy array.
a = m.sample()
d[0] = m.log_prob(a)
d

array([-1.38629436,  0.        ,  0.        ,  0.        ,  0.        ])

In [42]:
import torch
import torch.nn as nn
import numpy as np

def REINFORCE(agent, env, dist, optimizer, EPI_LEN , bf = lambda x: x, DISC_FACTOR = 0.95, MAX_EPISODES = 300, EARLY = lambda x: False):
  """
  A first implementation of the REINFORCE Algorithm.

  For every episode one trajectory is sampled from `env` with the current
  policy, discounted returns are computed, and a single optimizer step is
  taken on the loss  sum_t( -log_prob(a_t) * bf(returns)_t ).

  Works only on discrete action spaces so far

  Args:
    agent:   Acting agent/policy estimator eg a multilayer perceptron that will be trained.
             Its `forward(state)` output is passed to `dist` as `logits`.
    env:     Environment. `reset()` must return a numpy observation and
             `step(action)` a 4-tuple `(state, reward, done, info)`.
    dist:    Parametric torch distribution (constructed as `dist(logits=...)`)
    optimizer: Torch optimizer
    EPI_LEN:   (uint) Max number of steps in an Episode. A trajectory is cut off
               after EPI_LEN steps even if the environment has not reported `done`.
    bf: (function: 1d torch.tensor -> 1d torch.tensor) basis function for scaling of returns. Default identity function
    DISC_FACTOR: (float) Discount factor aka gamma
    MAX_EPISODES: Number of episodes that should be sampled. Default = 300
    EARLY: (function: 0-d torch.tensor -> Boolean) Called after every policy update with the
           episode's summed (undiscounted) reward; training stops when it returns a truthy value

  Raises:
    ValueError: if EPI_LEN is smaller than 1.

  Examples:
    from torch.distributions import Categorical
    import gym
    import torch.nn as nn
    from py_inforce.generic.mlp import MLP
    import torch.optim as optim

    env = gym.make('CartPole-v0')
    in_dim = env.observation_space.shape[0] # 4
    out_dim = env.action_space.n # 2
    cart_agent = MLP([in_dim, 128, 128, out_dim], nn.ReLU)
    optimizer = optim.Adam(cart_agent.parameters(), lr=cart_agent.lr)

    REINFORCE(cart_agent, env, Categorical, optimizer, 200, bf = lambda x: x - x.mean(), MAX_EPISODES=500, EARLY = lambda x: x == 200)
  """
  if EPI_LEN < 1:
    raise ValueError("EPI_LEN must be at least 1, got {}".format(EPI_LEN))

  for _episode in range(MAX_EPISODES):
    #####################
    # Sample a trajectory
    #####################
    done = False

    state = env.reset()

    rewards = torch.zeros(EPI_LEN)
    # Collected per step and stacked afterwards; avoids in-place writes into a
    # pre-allocated tensor that takes part in the autograd graph.
    log_probs = []

    step = 0
    # Also stop at EPI_LEN: without this bound an environment that keeps running
    # would index past the end of the `rewards` buffer.
    while not done and step < EPI_LEN:
      state = torch.from_numpy(state.astype(np.float32))
      pd = dist(logits=agent.forward(state))
      action = pd.sample()
      state, reward, done, _ = env.step(action.numpy())
      rewards[step] = reward
      log_probs.append(pd.log_prob(value=action))
      step += 1


    ############################
    # Compute discounted Returns
    ############################

    # Backward recursion G_t = r_t + gamma * G_{t+1}
    returns = torch.zeros(step)
    ret = 0.0
    for t in reversed(range(step)):
      ret = rewards[t] + DISC_FACTOR * ret
      returns[t] = ret

    returns = bf(returns)

    ###################
    # Update policy net
    ###################

    loss = torch.sum(- torch.stack(log_probs) * returns)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Entries past `step` are still zero, so summing the whole buffer gives the
    # episode's total reward.
    if EARLY(rewards.sum()):
        break

#######    
# Train
#######

from torch.distributions import Categorical
import gym
import torch.nn as nn
from py_inforce.generic.mlp import MLP
import torch.optim as optim

# CartPole environment; network input/output widths are read from its spaces.
env = gym.make('CartPole-v0')
in_dim, out_dim = env.observation_space.shape[0], env.action_space.n  # 4, 2

# Fresh policy network plus an Adam optimizer using the rate stored on it.
cart_agent = MLP([in_dim, 128, 128, out_dim], nn.ReLU)
optimizer = optim.Adam(cart_agent.parameters(), lr=cart_agent.lr)

# Run the REINFORCE defined in this cell: episodes capped at 200 steps,
# mean-centred returns, up to 500 episodes, stop early once an episode's
# summed reward equals 200.
REINFORCE(
    cart_agent,
    env,
    Categorical,
    optimizer,
    200,
    bf=lambda returns: returns - returns.mean(),
    MAX_EPISODES=500,
    EARLY=lambda total: total == 200,
)

In [53]:
######
# Test
######

# Roll out a single episode with the current `cart_agent`, sampling actions
# from the categorical distribution given by the network's logits, and show
# the total (undiscounted) reward collected.
done = False

state = env.reset()
rewards = 0

# Nothing is back-propagated during evaluation, so disable autograd tracking
# for the whole rollout instead of building a graph at every step.
with torch.no_grad():
    while not done:
        state = torch.from_numpy(state.astype(np.float32))
        # Call the module itself rather than .forward() so module hooks run.
        pd = Categorical(logits=cart_agent(state))
        action = pd.sample()
        state, reward, done, _ = env.step(action.numpy())
        rewards += reward
        #env.render()

rewards

200.0

In [32]:
# Display the network's repr to inspect its layer structure.
cart_agent

MLP(
  (model): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [56]:
import torch 
import torch.nn as nn
import numpy as np

class MLP(nn.Module):
  """
  Multilayer perceptron template

  Builds Linear layers for consecutive pairs of sizes in `dims`, with one
  activation module between every two Linear layers (none after the last).
  """
  def __init__(self, dims, activation = nn.ReLU, weight_init = None, LEARN_RATE = 0.01):
    """
    Generic template for multilayer perceptrons

    Args:
      dims: (1d iterable) eg list, contains sizes of each layer. Needs at least
            two entries (input and output size); fewer raises IndexError.
      activation: (callable) Zero-argument factory returning an activation module,
                  called once per hidden layer. Default is torch.nn.ReLU
      weight_init: (function) Initializes weights; called once with every Linear layer.
                   Default: torch.nn.init.xavier_uniform_ on the weight and bias filled with 0.01
      LEARN_RATE: (float) learning rate. DEFAULT = 0.01. Only stored as `self.lr`
                  so it can be passed on to an optimizer

    Examples:
      cart_agent = MLP([4, 6, 6, 2])
      cart_agent = MLP([4, 6, 6, 2], nn.LeakyReLU)

      def weight_init(m):
        torch.nn.init.xavier_normal_(m.weight)
        m.bias.data.fill_(0.01)

      cart_agent = MLP([4, 6, 6, 2], nn.LeakyReLU, weight_init)
    """
    super(MLP, self).__init__()

    # Identity test: `== None` would dispatch to a custom __eq__ on the supplied
    # callable and could silently replace it with the default initializer.
    if weight_init is None:
        def weight_init(m):
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)

    # Materialise once so any iterable (tuple, generator, ...) can be indexed.
    dims = list(dims)

    first = nn.Linear(dims[0], dims[1])
    weight_init(first)
    layers = [first]

    # Remaining layers: activation followed by the next Linear, in order.
    for fan_in, fan_out in zip(dims[1:-1], dims[2:]):
      layers.append(activation())
      linear = nn.Linear(fan_in, fan_out)
      weight_init(linear)
      layers.append(linear)

    self.model = nn.Sequential(*layers)

    self.lr = LEARN_RATE

  def forward(self, x):
    """Run `x` through the sequential stack and return the last layer's output."""
    return self.model(x)


def weight_init(m):
    """Initialise a layer in place: Xavier-normal weights, every bias entry set to 0.01."""
    initializers = torch.nn.init
    initializers.xavier_normal_(m.weight)
    bias_values = m.bias.data
    bias_values.fill_(0.01)

# Small 4-6-6-2 network with LeakyReLU activations, initialised by the
# custom weight_init defined above.
cart_agent = MLP(
    dims=[4, 6, 6, 2],
    activation=nn.LeakyReLU,
    weight_init=weight_init,
)

# Inspect the weight matrix of the first Linear layer.
cart_agent.model[0].weight

Parameter containing:
tensor([[ 0.4122,  0.4504,  0.4007,  0.8678],
        [ 0.5437,  0.4246, -0.0835,  0.0781],
        [ 0.2869, -0.2023, -0.3366, -0.2826],
        [ 0.0464,  0.7584, -0.2475,  0.3501],
        [ 0.0872,  0.0964, -0.6560,  0.6187],
        [ 0.2616,  0.2514, -0.1930, -0.8168]], requires_grad=True)