In [1]:
import gym
import itertools
import matplotlib
import numpy as np
import time
import os
import tensorflow as tf

In [2]:
class Normalizer():
    '''
    Running per-feature statistics of the states seen so far,
    plus optional standardization of new observations.

    The count, mean and variance are updated incrementally,
    one state at a time, so no history of states is kept.
    '''
    def __init__(self, nb_inputs, mode):
        '''
        nb_inputs: number of state features to track
        mode=0 for V1/V1-t (observations are returned unchanged)
        mode=1 for V2/V2-t (observations are standardized)
        '''
        self.mode = mode
        # One running accumulator per state feature.
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.var = np.zeros(nb_inputs)

    def observe(self, x):
        '''
        Fold the state x into the running count, mean and variance.
        '''
        self.n += 1.
        previous_mean = self.mean.copy()
        self.mean += (x - previous_mean) / self.n
        # Accumulated squared deviation, built from the mean before and after this sample.
        self.mean_diff += (x - previous_mean) * (x - self.mean)
        # Floor the variance at 1e-2 so that normalize never divides by a tiny value.
        self.var = np.clip(self.mean_diff / self.n, 1e-2, None)

    def normalize(self, ob):
        '''
        Return ob as is (mode 0) or standardized with the running
        mean and standard deviation (mode 1).

        Raises NotImplementedError for any other mode.
        '''
        if self.mode == 0:
            return ob
        if self.mode == 1:
            return (ob - self.mean) / np.sqrt(self.var)
        raise NotImplementedError

In [3]:
class Worker(object):
    '''
    Owns one seeded gym environment and runs ARS rollouts in it.
    '''

    def __init__(self,env_seed=123,
                 env_name='Swimmer-v2',
                 policy_params=None,
                 layer_size=None,
                 delta_std=0.02,
                 ):
        '''
        env_seed: seed applied to this worker's environment
        env_name: environment id passed to gym.make
        policy_params, layer_size: accepted for interface compatibility;
            the worker itself does not use them
        delta_std: scale of the weight perturbations, forwarded to
            policy.evaluate on every step
        '''
        # Initialize OpenAI environment for each worker
        self.env = gym.make(env_name)
        self.env.seed(env_seed)
        self.delta_std = delta_std

    def explore(self, env, normalizer, policy, direction = None, delta = None, max_steps = 1000):
        '''
        Run one episode of `policy` in `env` and return the summed reward.

        env: environment to roll out in (reset at the start of the episode)
        normalizer: updated with every visited state, then used to
            normalize that state before it is fed to the policy
        policy: object exposing evaluate(state, delta, delta_std, direction)
        direction, delta: perturbation direction and perturbation passed
            through to policy.evaluate; both None evaluates the unperturbed policy
        max_steps: hard cap on the number of environment steps

        The episode ends when the environment reports done or after
        exactly max_steps steps, whichever comes first.
        '''
        state = env.reset()
        sum_rewards = 0
        for _step in range(max_steps):
            normalizer.observe(state)
            state = normalizer.normalize(state)
            action = policy.evaluate(state, delta, self.delta_std, direction)
            state, reward, done, _ = env.step(action)
            sum_rewards += reward
            if done:
                break
        return sum_rewards

    def do_rollout(self, num_rollouts, normalizer, policy):
        '''
        Sample num_rollouts random perturbations and evaluate the policy
        once in the positive and once in the negative direction of each.

        Returns a dict with keys 'deltas', 'positive_rewards' and
        'negative_rewards'; index k of each list refers to the same perturbation.
        '''
        deltas = policy.sample_deltas(num_rollouts)

        # Roll out in the worker's own seeded environment (self.env) so the
        # method does not depend on a variable defined elsewhere in the notebook.
        # All positive directions are evaluated first, then all negative ones.
        positive_rewards = [
            self.explore(self.env, normalizer, policy, direction = "positive", delta = deltas[k])
            for k in range(num_rollouts)
        ]
        negative_rewards = [
            self.explore(self.env, normalizer, policy, direction = "negative", delta = deltas[k])
            for k in range(num_rollouts)
        ]

        return {'deltas': deltas, 'positive_rewards': positive_rewards, 'negative_rewards': negative_rewards}

In [23]:
class ARSLearner(object):
    """ 
    Object class implementing the ARS algorithm.

    Holds the policy, the state normalizer and a single rollout worker,
    and improves the policy by random search over weight perturbations.
    """
    def __init__(self,
                 env_name='Swimmer-v2',
                 policy_params=None,
                 learning_rate=0.02,
                 delta_std=0.02, 
                 num_iter=1000,
                 layer_size=None,
                 seed=123,
                 mode=0
                ):
        """
        env_name: gym environment id, forwarded to the worker
        policy_params: dict with keys 'ob_dim' and 'ac_dim'
        learning_rate: step size of the policy update
        delta_std: scale of the weight perturbations, forwarded to the worker
        num_iter: number of training iterations run by train
        layer_size: list of weight-matrix shapes, forwarded to Policy
        seed: base seed; the worker's environment is seeded with seed + 7
        mode: normalizer mode (0 = no normalization, 1 = standardize states)
        """
        self.policy = Policy(policy_params['ob_dim'], policy_params['ac_dim'],layer_size)
        self.learning_rate = learning_rate
        self.num_iter = num_iter
        self.normalizer = Normalizer(policy_params['ob_dim'],mode)          
        self.worker = Worker(env_seed=seed + 7,
                             env_name=env_name,
                             policy_params=policy_params,
                             layer_size=layer_size,
                             delta_std=delta_std)
        
    def train(self,num_rollouts,max_b):
        """
        Run self.num_iter ARS iterations.

        num_rollouts: number of perturbation directions sampled per iteration
        max_b: number of best directions kept for the update

        Each iteration prints its wall-clock time and the reward of one
        rollout of the updated, unperturbed policy.
        """
        for t in range(self.num_iter):
            t1 = time.time() 
            result_dict = self.worker.do_rollout(num_rollouts, self.normalizer, self.policy) 

            # Gather the result
            deltas = result_dict['deltas']
            positive_rewards = result_dict['positive_rewards']
            negative_rewards = result_dict['negative_rewards']

            # Standard deviation over every collected reward, used to scale the step
            all_rewards = np.array(positive_rewards + negative_rewards)
            sigma_r = all_rewards.std()

            # Rank the directions by max(r_pos, r_neg) and keep the max_b best ones
            scores = [max(r_pos, r_neg) for r_pos, r_neg in zip(positive_rewards, negative_rewards)]
            order = sorted(range(len(scores)), key = scores.__getitem__, reverse = True)[:max_b]
            rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

            # Updating our policy
            self.policy.update(rollouts, sigma_r, self.learning_rate, max_b)

            # Evaluate the updated policy in the worker's own environment, so that
            # training does not depend on a variable defined elsewhere in the notebook.
            reward_evaluation = self.worker.explore(self.worker.env, self.normalizer, self.policy)
            t2 = time.time() 
            print('total time of one step:', t2 - t1,', reward=',reward_evaluation,', iter ', t,' done')      
    
#     def evaluate(self,test_num,isRender):
#         returns = []
#         for i in range(test_num):
#             print('iter', i)
#             state = env.reset()
#             done = False
#             num_plays = 0
#             sum_rewards = 0
#             while not done and num_plays < 1000:
#                 state = self.normalizer.normalize(state)
#                 action = self.policy.evaluate(state, None, delta_std, None)
#                 state, reward, done, _ = env.step(action)
#                 sum_rewards += reward
#                 num_plays += 1
#                 if isRender:
#                     env.render()   
#                 if num_plays % 100 == 0: print("%i/%i"%(num_plays, env.spec.timestep_limit))

#             returns.append(totalr)

#         print('returns', returns)
#         print('mean return', np.mean(returns))
#         print('std of return', np.std(returns))


In [24]:
# Policy network whose weights are perturbed and updated by the random search
class Policy():
    """
    Policy Function approximator. 

    A stack of weight matrices without biases: every layer but the last
    is followed by relu, the last by tanh scaled by action_scale.
    """
    
    def __init__(self, input_size, output_size, layer_size, action_scale=2.0):
        """
        input_size, output_size: accepted for interface compatibility;
            the layer shapes are taken from layer_size only
        layer_size: list of (rows, cols) shapes, one per weight matrix,
            in the order the layers are applied
        action_scale: factor applied to the tanh output
        """
        self.depth = len(layer_size)
        self.action_scale = action_scale
        # Weights start uniformly random in [0, 1)
        self.weights = [np.random.rand(*shape) for shape in layer_size]
    
    def evaluate(self, input, delta = None, delta_std =  0.02, direction = None):
        """
        Compute the action for the state `input`.

        delta: list with one perturbation array per layer; only read
            when direction is not None
        delta_std: scale applied to delta
        direction: None evaluates the current weights, "positive" uses
            weights + delta_std * delta, and any other value uses
            weights - delta_std * delta
        """
        if direction is None:
            layers = self.weights
        else:
            sign = 1.0 if direction == "positive" else -1.0
            layers = [self.weights[i] + sign * delta_std * delta[i] for i in range(self.depth)]

        activation = input
        for w in layers[:-1]:
            activation = relu(w.dot(activation))
        return np.tanh(layers[-1].dot(activation)) * self.action_scale
                                                      
    def sample_deltas(self,num_rollouts):
        """
        Return num_rollouts perturbations; each one is a list of standard
        normal arrays shaped like the corresponding weight matrices.
        """
        return [
            [np.random.randn(*w.shape) for w in self.weights]
            for _ in range(num_rollouts)
        ]
    
    def update(self, rollouts, sigma_r,learning_rate,max_b):
        """
        Move the weights along the reward-weighted sum of the perturbations.

        rollouts: iterable of (r_pos, r_neg, delta) tuples
        sigma_r: reward standard deviation used to normalize the step
        learning_rate: step size
        max_b: number of directions the step is averaged over

        The weights are left untouched when max_b * sigma_r is zero:
        dividing by it would turn them into inf/NaN.
        """
        denominator = max_b * sigma_r
        if denominator == 0:
            return

        step = [np.zeros(w.shape) for w in self.weights]
        for r_pos, r_neg, delta in rollouts:
            for i in range(self.depth):
                step[i] += (r_pos - r_neg) * delta[i]
                
        for i in range(self.depth):
            self.weights[i] += learning_rate / denominator * step[i]

In [25]:
def relu(input):
    """Rectified linear unit: keep the positive entries of input, zero out the rest."""
    positive_mask = input > 0
    return input * positive_mask

In [27]:
# Test the algorithm: train ARS on Pendulum-v0
env_name = 'Pendulum-v0'
seed = 123

# The initial policy weights and the sampled perturbations come from
# np.random, so seed it to make the run reproducible.
np.random.seed(seed)

env = gym.make(env_name)
env.seed(seed)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
policy_params={'ob_dim':ob_dim,'ac_dim':ac_dim}
print(policy_params)

# Hyperparameters
learning_rate=0.005
delta_std=0.01
num_iter=10000
num_rollouts = 16   # perturbation directions sampled per iteration
max_b=4             # best directions kept for each update
mid_size = 20       # width of the hidden layer
# One (rows, cols) weight shape per layer: observation -> hidden -> action
layer_size = [[mid_size, ob_dim], [ac_dim, mid_size]]
mode=1              # 1 = standardize the states before feeding the policy

Learner = ARSLearner(env_name=env_name,
                     policy_params=policy_params,
                     learning_rate=learning_rate,
                     delta_std=delta_std,
                     num_iter=num_iter,
                     layer_size=layer_size,
                     seed=seed,
                     mode=mode)

Learner.train(num_rollouts,max_b)

{'ob_dim': 3, 'ac_dim': 1}
Here
total time of one step: 0.23935484886169434 , reward= -1306.839634383308 , iter  0  done
total time of one step: 0.24234342575073242 , reward= -1374.7820031348629 , iter  1  done
total time of one step: 0.2423412799835205 , reward= -1491.29455497441 , iter  2  done
total time of one step: 0.23978924751281738 , reward= -1657.3021002586788 , iter  3  done
total time of one step: 0.25702452659606934 , reward= -1437.3680548623704 , iter  4  done
total time of one step: 0.259885311126709 , reward= -1559.6627039368725 , iter  5  done
total time of one step: 0.24335265159606934 , reward= -1390.6419605357537 , iter  6  done
total time of one step: 0.25481700897216797 , reward= -1710.4890150623992 , iter  7  done
total time of one step: 0.2677340507507324 , reward= -1471.5218067611936 , iter  8  done
total time of one step: 0.25530052185058594 , reward= -1634.8068578714192 , iter  9  done
total time of one step: 0.25711631774902344 , reward= -1489.9826123976568 ,

KeyboardInterrupt: 