## Cross-Entropy Method (CEM) for Policy Optimization, using OpenAI Gym and NumPy

### Lab1 -> Deep Reinforcement Learning - John Schulman MLSS
http://rl-gym-doc.s3-website-us-west-2.amazonaws.com/mlss/lab1.html

1 -> Import Modules

In [None]:
from __future__ import print_function
import numpy as np
import gym
from gym.spaces import Discrete, Box

2 -> Define discrete action policy generator

In [None]:
class DeterministicDiscreteActionLinearPolicy(object):
    """Linear policy that deterministically picks the best-scoring action.

    ``theta`` is a flat parameter vector. Its first ``obs_dim * act_dim``
    entries are reshaped into the weight matrix ``W`` of shape
    ``(obs_dim, act_dim)``; the remaining ``act_dim`` entries are the bias
    ``b``. ``obs_dim`` is read from ``obs_space.shape[0]`` and ``act_dim``
    from ``act_space.n``.
    """

    def __init__(self, theta, obs_space, act_space):
        n_obs = obs_space.shape[0]
        n_act = act_space.n
        n_weights = n_obs * n_act
        # theta must hold exactly one weight matrix plus one bias vector.
        assert len(theta) == n_weights + n_act
        self.W = theta[:n_weights].reshape(n_obs, n_act)
        self.b = theta[n_weights:]

    def act(self, obs):
        """Return the index of the action with the largest linear score."""
        scores = np.dot(obs, self.W) + self.b
        return scores.argmax()

3 -> Define continuous action policy generator

In [None]:
class DeterministicContinuousActionLinearPolicy(object):
    """Linear policy whose output is clipped to the action-space bounds.

    ``theta`` is a flat parameter vector. Its first ``obs_dim * act_dim``
    entries are reshaped into the weight matrix ``W`` of shape
    ``(obs_dim, act_dim)``; the remaining ``act_dim`` entries are the bias
    ``b``. Both dimensions are read from the ``shape[0]`` of the
    corresponding space.
    """

    def __init__(self, theta, obs_space, act_space):
        n_obs = obs_space.shape[0]
        n_act = act_space.shape[0]
        n_weights = n_obs * n_act
        # theta must hold exactly one weight matrix plus one bias vector.
        assert len(theta) == n_weights + n_act
        self.W = theta[:n_weights].reshape(n_obs, n_act)
        self.b = theta[n_weights:]
        # Kept so act() can clip against the space's low/high bounds.
        self.act_space = act_space

    def act(self, obs):
        """Return the linear output clipped to ``[act_space.low, act_space.high]``."""
        raw_action = np.dot(obs, self.W) + self.b
        return np.clip(raw_action, self.act_space.low, self.act_space.high)

4 -> Generate appropriate policy using theta

In [None]:
def make_policy(theta):
    """Build the linear policy that matches the global ``env``'s action space.

    Args:
        theta: flat parameter vector; it is passed unchanged to the policy
            constructor, which checks its length.

    Returns:
        A ``DeterministicDiscreteActionLinearPolicy`` when the action space
        is ``Discrete``, or a ``DeterministicContinuousActionLinearPolicy``
        when it is ``Box``.

    Raises:
        NotImplementedError: if the action space is neither ``Discrete``
            nor ``Box``.
    """
    action_space = env.action_space
    if isinstance(action_space, Discrete):
        return DeterministicDiscreteActionLinearPolicy(
            theta, env.observation_space, action_space)
    if isinstance(action_space, Box):
        return DeterministicContinuousActionLinearPolicy(
            theta, env.observation_space, action_space)
    # Raise rather than return the exception class: a returned class would be
    # handed to do_episode() as if it were a policy and fail later on .act().
    raise NotImplementedError(
        "Unsupported action space: %r" % (action_space,))

5 -> Do an episode using policy

In [None]:
def do_episode(policy, render=False):
    """Run one episode of ``policy`` in the global ``env``.

    The episode lasts at most ``num_step`` steps and ends early as soon as
    the environment reports ``done``. When ``render`` is true the
    environment is rendered on every third step.

    Returns:
        The sum of the rewards collected during the episode.
    """
    episode_return = 0
    observation = env.reset()
    for step in range(num_step):
        action = policy.act(observation)
        observation, reward, finished, _ = env.step(action)
        episode_return = episode_return + reward
        if render and step % 3 == 0:
            env.render()
        if finished:
            break
    return episode_return

6 -> Evaluate a policy

In [None]:
def noisy_evaluation(theta):
    """Return the total reward of one episode played with parameters ``theta``."""
    return do_episode(make_policy(theta))

7 -> Define parameters

In [None]:
# Maximum number of environment steps per episode (loop bound in do_episode).
num_step = 500
# Number of iterations run by the main optimization loop.
n_iter = 100
# Number of parameter vectors sampled and evaluated per iteration.
batch_size = 25
# Fraction of each batch kept as the elite set used to refit the distribution.
elite_fac = 0.2

8 -> Initialize game

In [None]:
# Choose the Gym environment to optimize; keep exactly one line uncommented.
# The trailing note is the action-space type, which make_policy() uses to
# select the policy class.
# env = gym.make("CartPole-v0") # Discrete
# env = gym.make("Pendulum-v0") # Continuous
# env = gym.make("Acrobot-v1") # Discrete
env = gym.make("MountainCar-v0") # Discrete

[2017-04-14 14:56:57,130] Making new env: MountainCar-v0


9 -> Initialize mean and std of theta

In [None]:
# theta holds one weight per (observation dimension, policy output) pair plus
# one bias per policy output, hence (obs_dim + 1) * n_outputs entries. This
# matches the length check in the policy constructors.
if isinstance(env.action_space, Discrete):
    # Discrete space: one policy output per action.
    theta_dim = (env.observation_space.shape[0] + 1) * env.action_space.n
elif isinstance(env.action_space, Box):
    # Box space: one policy output per action dimension.
    theta_dim = (env.observation_space.shape[0] + 1) * env.action_space.shape[0]
else:
    raise NotImplementedError
# Initial sampling distribution over theta: zero mean and unit spread in
# every coordinate.
theta_mean = np.zeros(theta_dim)
theta_std = np.ones(theta_dim)

Main Loop
1. Sample `batch_size` parameter vectors theta ~ N(theta_mean, theta_std).
2. For each sampled theta, run one episode and record its total reward f_1, f_2, ..., f_n.
3. Rank the thetas by reward and keep the top `elite_fac` fraction; this is the elite set.
4. Fit a Gaussian N(theta_mean', theta_std') to the elite set by reward-weighted maximum likelihood: $\max_{\mu,\sigma} \sum_i f_i \log p(\theta_i \mid \mu, \sigma)$.
5. Replace theta_mean and theta_std with theta_mean' and theta_std'.

In [None]:
# Cross-entropy method main loop: repeatedly sample parameter vectors,
# evaluate them, and refit the sampling Gaussian to the best ones.

# The elite count is the same in every iteration. Keep at least one sample so
# the refit below never averages over an empty elite set.
n_elite = max(1, int(batch_size * elite_fac))

for iteration in range(n_iter):
    # Draw batch_size candidates from the diagonal Gaussian with mean
    # theta_mean and per-coordinate standard deviation theta_std.
    thetas = theta_mean + theta_std * np.random.randn(batch_size, theta_dim)
    rewards = np.array([noisy_evaluation(theta) for theta in thetas], dtype=float)

    # argsort is ascending, so the last n_elite indices are the best candidates.
    elite_inds = np.argsort(rewards)[-n_elite:]
    elite_thetas = thetas[elite_inds]
    elite_rewards = rewards[elite_inds]

    # Reward-weighted maximum likelihood needs non-negative weights with a
    # positive sum. With negative rewards (e.g. MountainCar's -1 per step)
    # normalizing by the reward sum would give the *worst* elites the largest
    # weight, and mixed signs can yield a negative "variance" or a division by
    # zero. In those cases fall back to uniform weights over the elite set.
    if elite_rewards.min() >= 0 and elite_rewards.sum() > 0:
        weights = elite_rewards / elite_rewards.sum()
    else:
        weights = np.full(elite_rewards.shape, 1.0 / elite_rewards.size)
    weights = weights.reshape(-1, 1)

    # Weighted mean, then weighted spread around the *new* mean. The square
    # root keeps theta_std a true standard deviation, matching its name and
    # the sampling step above.
    theta_mean = np.sum(weights * elite_thetas, axis=0)
    theta_std = np.sqrt(
        np.sum(weights * np.square(elite_thetas - theta_mean), axis=0))

    print("Iteration %i. mean f: %8.3g. max f: %8.3g" % (iteration + 1, np.mean(rewards), np.max(rewards)))
    # Uncomment to watch the current mean policy play one episode:
    # do_episode(make_policy(theta_mean), render=True)

Iteration 1. mean f:     -200. max f:     -200
Iteration 2. mean f:     -200. max f:     -200
Iteration 3. mean f:     -200. max f:     -200
Iteration 4. mean f:     -200. max f:     -200
Iteration 5. mean f:     -200. max f:     -200
Iteration 6. mean f:     -200. max f:     -200
Iteration 7. mean f:     -200. max f:     -200
Iteration 8. mean f:     -200. max f:     -200
Iteration 9. mean f:     -200. max f:     -200
Iteration 10. mean f:     -200. max f:     -200
Iteration 11. mean f:     -200. max f:     -200
Iteration 12. mean f:     -200. max f:     -200
Iteration 13. mean f:     -200. max f:     -200
Iteration 14. mean f:     -200. max f:     -200
Iteration 15. mean f:     -200. max f:     -200
Iteration 16. mean f:     -200. max f:     -200
Iteration 17. mean f:     -200. max f:     -200
Iteration 18. mean f:     -200. max f:     -200
Iteration 19. mean f:     -200. max f:     -200
Iteration 20. mean f:     -200. max f:     -200
Iteration 21. mean f:     -200. max f:     -200
I