In [1]:
# TEMP: make a local lagom checkout importable.
# This cell becomes unnecessary once lagom is installed as a package.
import sys

# NOTE(review): machine-specific absolute path; adjust it to your own checkout.
LAGOM_PATH = '/home/zuo/Code/lagom/'

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if LAGOM_PATH not in sys.path:
    sys.path.append(LAGOM_PATH)

# Evolution strategies on an OpenAI Gym environment

## Create policy network for Pendulum

In [2]:
import gym
from gym.wrappers import Monitor

import torch
import torch.nn as nn
import torch.nn.functional as F

from lagom.agents import BaseAgent
from lagom.envs import EnvSpec, GymEnv
from lagom.core.networks import BaseMLP
from lagom.core.policies import BaseGaussianPolicy


class MLP(BaseMLP):
    """Two-headed network: one ReLU hidden layer feeding a mean and a log-variance head.

    The sizes are hard-coded: 3 input features, 32 hidden units and 1 output
    per head (presumably matching Pendulum's observation and action sizes -
    TODO confirm).
    """
    def make_params(self, config):
        """Create the hidden layer and the two linear output heads."""
        self.fc1 = nn.Linear(3, 32)

        self.mean_head = nn.Linear(32, 1)
        self.logvar_head = nn.Linear(32, 1)

    def init_params(self, config):
        """Orthogonally initialise every weight matrix and zero every bias."""
        # The same ReLU gain is used for the hidden layer and both heads.
        relu_gain = nn.init.calculate_gain('relu')

        for layer in (self.fc1, self.mean_head, self.logvar_head):
            nn.init.orthogonal_(layer.weight, gain=relu_gain)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, x):
        """Return a dict holding the 'mean' and 'logvar' head outputs for input x."""
        hidden = F.relu(self.fc1(x))

        return {'mean': self.mean_head(hidden),
                'logvar': self.logvar_head(hidden)}


class GaussianPolicy(BaseGaussianPolicy):
    """Gaussian policy whose actions are squashed into the open interval (-2, 2)."""
    def process_network_output(self, network_out):
        """Contribute no additional outputs: always returns an empty dict."""
        return dict()

    def constraint_action(self, action):
        """Squash the action with tanh and scale it by 2."""
        return torch.tanh(action) * 2
    
    
class Agent(BaseAgent):
    """Agent that turns a single numpy observation into an action via a policy."""
    def __init__(self, policy, config):
        """Keep a reference to the policy; config is forwarded to BaseAgent."""
        super().__init__(config)
        self.policy = policy

    def choose_action(self, obs):
        """Pick an action for one observation.

        Args:
            obs: numpy array holding a single observation.

        Returns:
            dict with key 'action': the squashed action tensor with the
            leading batch dimension removed again.
        """
        # Convert to a float tensor and add a leading batch dimension
        batched_obs = torch.from_numpy(obs).float().unsqueeze(0)

        raw_action = self.policy(batched_obs)['action']

        # NOTE(review): this is the same 2*tanh transform that
        # GaussianPolicy.constraint_action defines. If the base policy already
        # applies constraint_action, the action is squashed twice - confirm.
        squashed_action = (2*torch.tanh(raw_action)).squeeze(0)

        return {'action': squashed_action}


def make_env(seed=None, monitor=False, monitor_dir=None):
    """Build the Pendulum-v0 environment used throughout this notebook.

    Args:
        seed: If not None, passed to ``seed`` on the wrapped environment.
        monitor: If True, wrap the raw gym environment in
            ``gym.wrappers.Monitor`` before the ``GymEnv`` wrapper is applied.
        monitor_dir: Output directory handed to ``Monitor``. Required when
            ``monitor`` is True, ignored otherwise.

    Returns:
        The environment wrapped in lagom's ``GymEnv``.

    Raises:
        ValueError: If ``monitor`` is True but ``monitor_dir`` is None.
    """
    # Fail fast with a clear message, before any environment is constructed,
    # instead of letting Monitor choke on a None directory.
    if monitor and monitor_dir is None:
        raise ValueError('monitor_dir must be provided when monitor=True')

    env = gym.make('Pendulum-v0')
    if monitor:
        env = Monitor(env, directory=monitor_dir)
    env = GymEnv(env)

    if seed is not None:
        env.seed(seed)

    return env

In [3]:
# Display the parameter count reported by the network; the ES search space
# dimension used later in the notebook has to match this number.
MLP().num_params

194

## Create evaluation function

In [4]:
import numpy as np

from lagom.envs import EnvSpec
from lagom.runner import Runner

def rollout(parameters, env, N, T):
    """Score a flat parameter vector by running the resulting policy in env.

    Args:
        parameters: numpy array of network parameters, loaded into a fresh MLP.
        env: environment to run in; also used to build the EnvSpec.
        N: forwarded to the runner as ``N`` (presumably the number of
            trajectories - TODO confirm against Runner).
        T: forwarded to the runner as ``T`` (presumably the step limit per
            trajectory - TODO confirm against Runner).

    Returns:
        The negated mean, over all collected trajectories, of the first entry
        of each trajectory's ``all_returns``.
    """
    flat_params = torch.from_numpy(parameters).float()
    spec = EnvSpec(env)

    # Rebuild the network from the flat vector and wrap it as policy and agent
    net = MLP(config=None)
    net.from_vec(flat_params)
    agent = Agent(policy=GaussianPolicy(network=net, env_spec=spec), config=None)

    # Collect trajectories with gamma fixed at 1.0
    trajectories = Runner(agent=agent, env=env, gamma=1.0)(N=N, T=T)
    initial_returns = [traj.all_returns[0] for traj in trajectories]

    # ES minimises, so the objective is the negated mean return
    return -np.mean(initial_returns)

## Create master-worker classes for ES

In [5]:
from time import time

from lagom.core.es import CMAES, OpenAIES

from lagom.core.es import BaseESWorker
from lagom.core.es import BaseGymESMaster


class ESWorker(BaseESWorker):
    """Worker that evaluates one candidate solution with environment rollouts."""
    def f(self, solution, seed):
        """Return the objective value of a candidate.

        Args:
            solution: pair of (parameter vector, environment factory).
            seed: seed handed to the environment factory.

        Returns:
            The value of ``rollout`` for this candidate, using N=5 and T=50.
        """
        candidate_params, env_factory = solution

        # A fresh, seeded environment for this evaluation
        seeded_env = env_factory(seed)

        return rollout(parameters=candidate_params, env=seeded_env, N=5, T=50)
    

class ESMaster(BaseGymESMaster):
    """Master that runs CMA-ES, logs progress and saves the final parameters."""
    def make_es(self):
        """Create the CMA-ES optimiser: zero initial mean, std 0.5, population 60."""
        # 194 is the parameter count that MLP().num_params displays earlier in
        # this notebook; keep the two in sync if the network changes.
        initial_mean = [0]*194

        return CMAES(mu0=initial_mean, std0=0.5, popsize=60)

    def _process_es_result(self, result):
        """Print progress on selected generations and save the last result.

        Prints on the first generation and on every 100th. When the final
        generation is reached, ``result['best_param']`` is written with
        ``np.save`` under the name 'trained_param'.
        """
        best_f_val = result['best_f_val']
        generation_number = self.generation + 1

        is_first = generation_number == 1
        is_hundredth = generation_number % 100 == 0
        if is_first or is_hundredth:
            # Flip the sign back: the optimiser minimises the negated return
            best_f_val = -best_f_val
            print(f'Best function value at generation {self.generation+1}: {best_f_val}')

        # Persist the parameters once the last generation has been processed
        if generation_number == self.num_iteration:
            np.save('trained_param', result['best_param'])
            
            
# Timestamp taken right before training starts
t = time()

# Train for 1000 iterations with 60 workers, starting from seed 0
es = ESMaster(num_iteration=1000,
              num_worker=60,
              worker_class=ESWorker,
              make_env=make_env,
              init_seed=0,
              daemonic_worker=None)
es()

# Report the elapsed wall-clock time
print(f'Total time: {time() - t:.3} s')

(30_w,60)-aCMA-ES (mu_w=16.6,w_1=12%) in dimension 194 (seed=400889, Wed Jul 11 17:16:42 2018)
Best function value at generation 1: -267.0451211258769
Best function value at generation 100: -53.67015787605196
Best function value at generation 200: -28.48752606920898
Best function value at generation 300: -28.48752606920898
Best function value at generation 400: -28.48752606920898
Best function value at generation 500: -1.7494624838232995
Best function value at generation 600: -1.7494624838232995
Best function value at generation 700: -1.7494624838232995
Best function value at generation 800: -1.7494624838232995
Best function value at generation 900: -1.7494624838232995
Best function value at generation 1000: -1.7494624838232995
Total time: 3.14e+02 s


## Evaluate the trained policy

In [28]:
# Restore the parameter vector written at the end of training
parameters = np.load('trained_param.npy')

# Unseeded environment wrapped in the gym Monitor, writing to 'logs/'
# NOTE(review): Monitor may refuse a directory that already holds files from
# a previous run - confirm before re-running this cell.
env = make_env(monitor_dir='logs/', monitor=True, seed=None)

# Evaluate one rollout with T=50; the displayed value is the negated return
function_value = rollout(parameters=parameters, env=env, N=1, T=50)
function_value

1.986847748979926