In [1]:
import gym
import torch
import numpy as np

from ppo import PPO
from model import MLPActorCritic

In [2]:
def bipedal_walker():
    """Create and return a fresh 'BipedalWalker-v2' gym environment.

    Takes no arguments so it can be handed around as an environment
    constructor and called wherever a new environment instance is needed.
    """
    env_id = 'BipedalWalker-v2'
    return gym.make(env_id)

# --- Environment -------------------------------------------------------------
# A local instance is built only to read the space dimensions below; the
# constructor itself is what gets handed to PPO.
env_constructor = bipedal_walker
env = env_constructor()

obs_dim = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# --- Run-level settings ------------------------------------------------------
seed = 0
num_workers = 10
max_steps = int(2e6)
max_ep_len = 200

# --- PPO hyperparameters -----------------------------------------------------
buffer_size = 7680
minibatch_size = 256
epochs = 10
lr = 0.0005
clip_ratio = 0.16

gamma = .99
GAE_lambda = .9

initial_sd = .5

# --- Compute device ----------------------------------------------------------
# Use CUDA when it is available (and say so); otherwise stay on the CPU.
if torch.cuda.is_available():
    print("Running PyTorch on GPU")
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# --- Network shapes ----------------------------------------------------------
# Both networks take the observation vector and use two hidden layers of 32.
# The actor has 2 outputs per action dimension (presumably distribution
# parameters -- TODO confirm against MLPActorCritic); the critic has 1 output.
actor_layer_sizes = [obs_dim, 32, 32, 2*num_actions]
critic_layer_sizes = [obs_dim, 32, 32, 1]
    
class Network(MLPActorCritic):
    """MLPActorCritic bound to this notebook's configuration.

    The layer sizes, initial_sd and num_actions are taken from the
    notebook-level variables, so only the device has to be supplied when
    an instance is created.
    """

    def __init__(self, device):
        super().__init__(
            actor_layer_sizes,
            critic_layer_sizes,
            initial_sd,
            num_actions,
            device,
        )
        
# PPO receives constructors (not instances) for both the environment and the
# actor-critic network.
ac_constructor = Network

ppo = PPO(
    env_constructor,
    ac_constructor,
    # Rollout collection
    buffer_size=buffer_size,
    max_steps=max_steps,
    max_ep_len=max_ep_len,
    num_workers=num_workers,
    seed=seed,
    # Optimisation
    lr=lr,
    epochs=epochs,
    batch_size=minibatch_size,
    clip_ratio=clip_ratio,
    gamma=gamma,
    lam=GAE_lambda,
    input_normalization=True,
    device=device,
    # Checkpointing and logging; the recorded run reports a saved model at
    # iterations 25 and 50, matching save_freq.
    save_freq=25,
    save_path="models/bipedal",
    log_path="tensorboard/bipedal",
    # No checkpoint is specified here (presumably trains from scratch --
    # TODO confirm how PPO interprets loading_type "none").
    loading_type="none",
    load_path="",
)

Running PyTorch on GPU


In [3]:
# Launch training. With print_freq=10 the recorded output below shows one
# summary line (loss, KL divergence, return, episode length) every 10 iterations.
ppo.train(print_freq=10)

Iteration: 10, steps: 76800, mean loss: -0.009072457434143871, mean KL divergence: 0.007267270977996911 average return: -43.424742391167726, average episode length: 136.78643869371496
Iteration: 20, steps: 153600, mean loss: -0.008658384288661182, mean KL divergence: 0.008609465962935551 average return: -3.5243355187853695, average episode length: 189.190243902439
Saved model at iteration: 25
Iteration: 30, steps: 230400, mean loss: -0.008260964438008766, mean KL divergence: 0.007964840163011102 average return: -2.3290315818615444, average episode length: 190.1921724333522
Iteration: 40, steps: 307200, mean loss: -0.007185199558812504, mean KL divergence: 0.008450971283484252 average return: -0.5231324746553118, average episode length: 189.68083623693377
Saved model at iteration: 50
Iteration: 50, steps: 384000, mean loss: -0.007582757863681764, mean KL divergence: 0.008945776831591501 average return: 1.7443061126380166, average episode length: 191.53170731707314
Iteration: 60, steps: 