In [1]:
import gym
import torch
import numpy as np

from ppo import PPO
from model import MLPActorCritic

In [2]:
# Build the environment first, then seed every RNG source this notebook
# touches (PyTorch, NumPy's global generator, and the env itself) with the
# same value.
env = gym.make('BipedalWalker-v2')

seed = 0
torch.manual_seed(seed)
np.random.seed(seed)
env.seed(seed)

# First entry of each space's shape; used below when constructing the
# actor-critic model.
obs_dim, num_actions = (
    env.observation_space.shape[0],
    env.action_space.shape[0],
)

# Tunable settings for this run. Unless noted otherwise, each value is
# forwarded to the PPO constructor further down under the keyword shown.

# Data collection / optimiser budget
buffer_size = 3840        # -> buffer_size
lr = 5.4e-4               # -> lr
max_steps = 10 ** 6       # -> max_steps

# Presumably the discount factor and the GAE lambda -- confirm in ppo.py
gamma = 0.99              # -> gamma
GAE_lambda = 0.902        # -> lam

# Update schedule
epochs = 10               # -> epochs
minibatch_size = 128      # -> batch_size

clip_ratio = 0.16         # -> clip_ratio
initial_sd = 0.5          # passed to MLPActorCritic, not to PPO

max_ep_len = 200          # -> max_ep_len

# Pick the compute device: CUDA when PyTorch reports it available (and say
# so), otherwise fall back to the CPU silently.
if torch.cuda.is_available():
    print("Running PyTorch on GPU")
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Two size lists are handed to the model: both start at obs_dim and pass
# through 32 -> 32; the first ends in 2 * num_actions, the second in 1.
# (Presumably actor and critic layer sizes respectively -- confirm in model.py.)
actor_critic = MLPActorCritic(
    [obs_dim, 32, 32, 2 * num_actions],
    [obs_dim, 32, 32, 1],
    initial_sd,
    num_actions,
    device,
)

# Wire the environment, the model and the hyperparameters defined above
# into the trainer.
ppo = PPO(
    env,
    actor_critic,
    buffer_size=buffer_size,
    max_steps=max_steps,
    gamma=gamma,
    clip_ratio=clip_ratio,
    lr=lr,
    epochs=epochs,
    batch_size=minibatch_size,
    lam=GAE_lambda,
    save_freq=50,
    save_path="models/Bipedal",
    device=device,
    input_normalization=True,
    time_feature=False,
    max_ep_len=max_ep_len,
)

Running PyTorch on GPU


In [3]:
# Start training. With print_freq=10 the captured log below shows a progress
# line at iterations 10, 20, 30, ...
ppo.train(print_freq=10)

Iteration: 10, steps: 39088, mean loss: -0.011892180977573582, mean KL divergence: 0.007934944518571185 average return: -46.24976903467844, average episode length: 151.1659749861742
Iteration: 20, steps: 78755, mean loss: -0.010455482362303882, mean KL divergence: 0.009428603608372919 average return: -4.541051293330047, average episode length: 198.335
Iteration: 30, steps: 118303, mean loss: -0.01016498365654702, mean KL divergence: 0.009731225477154376 average return: -4.8478431098612855, average episode length: 197.74
Iteration: 40, steps: 158036, mean loss: -0.009774593089945658, mean KL divergence: 0.009409905039954595 average return: -3.5809832975809073, average episode length: 198.665
Saved model at iteration: 50
Iteration: 50, steps: 197847, mean loss: -0.010498643525765322, mean KL divergence: 0.011761475445307422 average return: -2.9296253538281825, average episode length: 199.055
Iteration: 60, steps: 237736, mean loss: -0.010755511355535183, mean KL divergence: 0.01291354615