In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import wrappers

In [None]:
# Id of the OpenAI gym environment to train on
ENV_NAME = 'BipedalWalker-v2'
# Evaluation reward recorded after each training step; plotted at the end
REWARD_HISTORY = list()

## Hyperparameters

In [None]:
class Hp:
    """Bundle of hyperparameters for the random-search training run.

    Attributes:
        num_steps: number of training iterations.
        episode_length: upper bound on the number of environment steps per rollout.
        num_deltas: number of random perturbations sampled per iteration.
        num_best_deltas: how many of the best-scoring perturbations feed the update;
            must not exceed ``num_deltas``.
        record_every: a video is requested every ``record_every`` iterations.
        env_name: id of the gym environment.
        seed: seed for numpy's global random generator.
        noise: scale applied to a perturbation before it is added to the weights.
        alpha: learning rate.
    """

    def __init__(self,
                 num_steps=1500,
                 episode_length=3000,
                 num_deltas=20,
                 num_best_deltas=20,
                 record_every=100,
                 env_name='BipedalWalker-v2',
                 seed=42,
                 noise=0.03,
                 alpha=0.02):
        # Training schedule
        self.num_steps, self.episode_length = num_steps, episode_length

        # Search directions: cannot keep more "best" directions than were sampled
        self.num_deltas, self.num_best_deltas = num_deltas, num_best_deltas
        assert self.num_best_deltas <= self.num_deltas

        # Environment and bookkeeping
        self.record_every, self.env_name, self.seed = record_every, env_name, seed

        # Update rule: exploration noise scale and learning rate
        self.noise, self.alpha = noise, alpha


## Normalizer

In [None]:
class Normalizer:
    """Running standardisation of observations (zero mean, unit variance per input).

    Statistics are accumulated incrementally through ``observe`` and applied by
    ``normalize``. The variance is floored at ``min_variance`` so the division
    in ``normalize`` never hits zero.
    """

    def __init__(self, num_inputs, min_variance=1e-2):
        """Create zeroed statistics for ``num_inputs`` input dimensions.

        Args:
            num_inputs: size of the observation vector.
            min_variance: lower bound applied to the per-input variance.
        """
        self.n = np.zeros(num_inputs)           # Number of observations seen, per input
        self.mean = np.zeros(num_inputs)        # Running mean
        self.mean_diff = np.zeros(num_inputs)   # Running sum of squared deviations
        self.variance = np.zeros(num_inputs)    # Floored running variance (zero until first observe)
        self.min_variance = min_variance

    def observe(self, x):
        """Fold one observation ``x`` into the running mean and variance."""
        self.n += 1.0
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        # Uses both the previous and the updated mean (incremental variance update)
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.variance = (self.mean_diff / self.n).clip(min=self.min_variance)

    def normalize(self, inputs):
        """Return ``inputs`` centred on the running mean and scaled by the running std.

        The variance floor is re-applied here so that calling this before any
        ``observe`` (variance still all zeros) yields finite values instead of
        dividing by zero.
        """
        obs_mean = self.mean
        obs_std = np.sqrt(np.clip(self.variance, self.min_variance, None))
        return (inputs - obs_mean) / obs_std


## Policy

In [None]:
class Policy:
    """Linear policy ``action = theta . input`` with random-perturbation exploration."""

    def __init__(self, input_size, output_size, hp):
        """Start from an all-zero weight matrix of shape (output_size, input_size).

        Args:
            input_size: number of observation components.
            output_size: number of action components.
            hp: hyperparameter object; ``noise``, ``alpha``, ``num_deltas`` and
                ``num_best_deltas`` are read from it.
        """
        self.theta = np.zeros((output_size, input_size))
        self.hp = hp

    def evaluate(self, input, delta=None, direction=None):
        """Map an observation to an action.

        Args:
            input: observation vector multiplied by the weight matrix.
            delta: perturbation with the same shape as ``theta``; only used
                when ``direction`` is '+' or '-'.
            direction: ``None`` for the unperturbed policy, '+' to add
                ``noise * delta`` to the weights, '-' to subtract it.

        Returns:
            The action produced by the (possibly perturbed) weights.

        Raises:
            ValueError: if ``direction`` is not ``None``, '+' or '-'.
        """
        if direction is None:
            return self.theta.dot(input)
        if direction == '+':
            return (self.theta + self.hp.noise * delta).dot(input)
        if direction == '-':
            return (self.theta - self.hp.noise * delta).dot(input)
        # Previously an unknown direction fell through and silently returned None
        raise ValueError("direction must be None, '+' or '-', got {!r}".format(direction))

    def sample_deltas(self):
        """Return ``hp.num_deltas`` standard-normal perturbations shaped like ``theta``."""
        return [np.random.randn(*self.theta.shape) for _ in range(self.hp.num_deltas)]

    def update(self, rollouts, sigma_rewards):
        """Move ``theta`` along the reward-weighted sum of the given perturbations.

        Args:
            rollouts: iterable of ``(r_pos, r_neg, delta)`` tuples.
            sigma_rewards: standard deviation of the rollout rewards, used to
                scale the step.

        If ``sigma_rewards`` is not strictly positive (all rewards identical,
        or NaN) the update is skipped: dividing by it would otherwise fill
        ``theta`` with NaN/inf or raise ZeroDivisionError.
        """
        if not sigma_rewards > 0:
            return
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, delta in rollouts:
            step += (r_pos - r_neg) * delta
        self.theta += self.hp.alpha / (self.hp.num_best_deltas * sigma_rewards) * step


## Agent Trainer

In [None]:
class ArsAgent:
    """Trainer that improves a linear policy on a gym environment by random search.

    Each training step samples perturbations of the policy weights, rolls out
    the policy with every perturbation added and subtracted, and updates the
    weights from the best-scoring perturbations.
    """

    def __init__(self,
                 hp=None,
                 input_size=None,
                 output_size=None,
                 normalizer=None,
                 policy=None,
                 monitor_dir=None):
        """Build the environment and any component that was not supplied.

        Args:
            hp: hyperparameters; a default ``Hp()`` is created when omitted.
            input_size: observation size; defaults to the environment's
                observation space size.
            output_size: action size; defaults to the environment's action
                space size.
            normalizer: observation normalizer; defaults to a fresh ``Normalizer``.
            policy: policy object; defaults to a fresh ``Policy``.
            monitor_dir: when given, the environment is wrapped in a gym
                ``Monitor`` writing to this directory.
        """
        # Must be an instance: the bare `Hp` class has no `seed`, `env_name`, ...
        # attributes because they are only assigned inside `Hp.__init__`.
        self.hp = hp if hp is not None else Hp()
        np.random.seed(self.hp.seed)

        # Defined before the Monitor is built because its callback reads this flag.
        self.record_video = False

        self.env = gym.make(self.hp.env_name)
        if monitor_dir is not None:
            should_record = lambda i: self.record_video
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)

        # Prefer the environment's own step limit over the configured one.
        # `timestep_limit` is the legacy attribute name; `max_episode_steps`
        # is tried first and the legacy name kept as a fallback.
        spec = self.env.spec
        env_limit = getattr(spec, 'max_episode_steps', None) or getattr(spec, 'timestep_limit', None)
        # Note: this overwrites `episode_length` on the hp object that was passed in.
        self.hp.episode_length = env_limit or self.hp.episode_length

        self.input_size = input_size or self.env.observation_space.shape[0]
        self.output_size = output_size or self.env.action_space.shape[0]
        self.normalizer = normalizer if normalizer is not None else Normalizer(self.input_size)
        self.policy = policy if policy is not None else Policy(self.input_size, self.output_size, self.hp)

    def explore(self, direction=None, delta=None):
        """Run one episode and return the sum of its clipped rewards.

        Args:
            direction: forwarded to ``policy.evaluate`` (``None``, '+' or '-').
            delta: perturbation forwarded to ``policy.evaluate``.

        Returns:
            Sum of the per-step rewards, each clipped to the range [-1, 1].
            The episode ends when the environment reports ``done`` or after
            ``hp.episode_length`` steps.
        """
        state = self.env.reset()
        done = False
        num_plays = 0
        sum_rewards = 0.0
        while not done and num_plays < self.hp.episode_length:
            # Every visited state also updates the running normalisation statistics
            self.normalizer.observe(state)
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)
            state, reward, done, _ = self.env.step(action)
            # Clip so a single large reward or penalty cannot dominate the episode score
            sum_rewards += max(min(reward, 1), -1)
            num_plays += 1
        return sum_rewards

    def train(self):
        """Run ``hp.num_steps`` training iterations.

        After every policy update the unperturbed policy is rolled out once;
        its reward is appended to the module-level ``REWARD_HISTORY`` and
        printed. Every ``hp.record_every`` steps that evaluation episode is
        flagged for video recording.
        """
        for step in range(self.hp.num_steps):
            deltas = self.policy.sample_deltas()

            # Roll out each perturbation in both directions
            pos_rewards = []
            neg_rewards = []
            for delta in deltas:
                pos_rewards.append(self.explore(direction='+', delta=delta))
                neg_rewards.append(self.explore(direction='-', delta=delta))

            # NOTE(review): the deviation is taken over all 2 * num_deltas
            # rewards, not only those of the selected best directions —
            # confirm this is intended.
            sigma_rewards = np.array(pos_rewards + neg_rewards).std()

            # Rank directions by the better of their two rollouts and keep the top ones
            scores = [max(r_pos, r_neg) for r_pos, r_neg in zip(pos_rewards, neg_rewards)]
            order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:self.hp.num_best_deltas]
            rollouts = [(pos_rewards[k], neg_rewards[k], deltas[k]) for k in order]

            self.policy.update(rollouts, sigma_rewards)

            # Only the evaluation episode below is eligible for recording
            if step % self.hp.record_every == 0:
                self.record_video = True

            reward_evaluation = self.explore()
            REWARD_HISTORY.append(reward_evaluation)
            print('Step: {} | Reward: {}'.format(step, reward_evaluation))
            self.record_video = False


## Utility Function

In [None]:
def mkdir(base, name):
    """Ensure the directory ``base/name`` exists and return its path.

    Missing parent directories are created as well. An already existing
    directory is not an error; ``exist_ok=True`` avoids the race between
    checking for the directory and creating it.
    """
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok=True)
    return path


## Main Code

In [None]:
# Recordings are written to ./videos/<ENV_NAME>/
videos_dir = mkdir('.', 'videos')
monitor_dir = mkdir(videos_dir, ENV_NAME)
hp = Hp(env_name=ENV_NAME)
agent = ArsAgent(hp=hp, monitor_dir=monitor_dir)
try:
    agent.train()
finally:
    # Release the environment (and its Monitor wrapper) even when training is
    # interrupted, so rendering resources and recording files are not left open.
    agent.env.close()


## Plot

In [None]:
# Learning curve: one evaluation reward per training step
fig, ax = plt.subplots()
ax.plot(REWARD_HISTORY)
ax.set_xlabel('Step')
ax.set_ylabel('Reward')
ax.set_title('Rewards over Time')
plt.show()

![plot](https://image.ibb.co/igf4GV/plot.png)

## Result after 800 steps

![demo](https://image.ibb.co/fPkDbV/bipedal.gif)