In [2]:
import gym
import itertools
import matplotlib
import numpy as np
import time
import os

In [13]:
class Normalizer():
    '''
    Running per-dimension statistics of the observed states.

    Keeps an online estimate of the mean and of the (clipped)
    variance of every state component, and optionally uses
    them to standardize observations.
    '''
    def __init__(self, nb_inputs, mode):
        '''
        nb_inputs: number of components of a state vector
        mode=0 for V1/V1-t (observations are returned untouched)
        mode=1 for V2/V2-t (observations are standardized)
        '''
        self.mode = mode
        # One counter / accumulator per state component.
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.var = np.zeros(nb_inputs)

    def observe(self, x):
        '''
        Fold one more state into the running mean and variance.
        '''
        self.n += 1.
        previous_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        # Accumulate squared deviations using the mean before and after the update.
        self.mean_diff += (x - previous_mean) * (x - self.mean)
        # The variance is floored at 1e-2 so that the division in normalize stays bounded.
        self.var = (self.mean_diff / self.n).clip(min = 1e-2)

    def normalize(self, ob):
        '''
        Return ob unchanged (mode 0) or standardized with the
        running mean and standard deviation (mode 1).
        Any other mode raises NotImplementedError.
        '''
        running_std = np.sqrt(self.var)
        if self.mode == 0:
            return ob
        if self.mode == 1:
            return (ob - self.mean) / running_std
        raise NotImplementedError

In [14]:
class Worker(object):
    '''
    Owns one seeded gym environment and runs ARS rollouts in it.
    '''

    def __init__(self,env_seed=123,
                 env_name='Swimmer-v2',
                 policy_params=None,
                 delta_std=0.02,
                 ):
        '''
        env_seed: seed applied to this worker's environment
        env_name: environment id handed to gym.make
        policy_params: accepted for interface compatibility; not used by the worker
        delta_std: scale of the perturbation passed to policy.evaluate
        '''
        # Initialize OpenAI environment for each worker
        self.env = gym.make(env_name)
        self.env.seed(env_seed)
        self.delta_std = delta_std

    def explore(self, env, normalizer, policy, direction = None, delta = None, max_steps = 1000):
        '''
        Based on the input policy, explore the environment for one episode.

        env: environment to roll out in; None falls back to self.env
        normalizer: object whose observe() is fed every state and whose
                    normalize() output is given to the policy
        policy: object exposing evaluate(state, delta, delta_std, direction)
        direction: None, "positive" or "negative", forwarded to the policy
        delta: perturbation forwarded to the policy
        max_steps: upper bound on the number of environment steps

        Returns the sum of the rewards collected during the episode.
        '''
        if env is None:
            env = self.env
        state = env.reset()
        sum_rewards = 0
        # A bounded range keeps the horizon at exactly max_steps steps.
        for _step in range(max_steps):
            normalizer.observe(state)
            state = normalizer.normalize(state)
            action = policy.evaluate(state, delta, self.delta_std, direction)
            state, reward, done, _info = env.step(action)
            sum_rewards += reward
            if done:
                break
        return sum_rewards

    def do_rollout(self, num_rollouts, normalizer, policy):
        '''
        Generate random directions, and then evaluate the
        policy positively and negatively along each of them.

        Returns a dict with keys 'deltas', 'positive_rewards' and
        'negative_rewards'; index k of each list refers to the same direction.
        '''
        # Initializing the perturbations deltas
        deltas = policy.sample_deltas(num_rollouts)

        # Roll out in the worker's own (seeded) environment rather than in a
        # notebook-level global, so the worker is self-contained.
        # Getting the positive rewards in the positive directions
        positive_rewards = [
            self.explore(self.env, normalizer, policy, direction = "positive", delta = deltas[k])
            for k in range(num_rollouts)
        ]
        # Getting the negative rewards in the negative/opposite directions
        negative_rewards = [
            self.explore(self.env, normalizer, policy, direction = "negative", delta = deltas[k])
            for k in range(num_rollouts)
        ]

        return {'deltas': deltas, 'positive_rewards': positive_rewards, 'negative_rewards': negative_rewards}

In [17]:
class ARSLearner(object):
    """ 
    Object class implementing the ARS algorithm.

    Holds a linear policy, a state normalizer and a single worker that
    owns the environment used for both training and evaluation.
    """
    def __init__(self,
                 env_name='Swimmer-v2',
                 policy_params=None,
                 learning_rate=0.02,
                 delta_std=0.02, 
                 num_iter=1000,
                 seed=123,
                 mode=0
                ):
        """
        env_name: environment id forwarded to the worker
        policy_params: dict with keys 'ob_dim' and 'ac_dim' (required in
                       practice; it is indexed directly)
        learning_rate: step size used in the policy update
        delta_std: perturbation scale forwarded to the worker
        num_iter: number of training iterations run by train()
        seed: base seed; the worker's environment is seeded with seed + 7
        mode: normalizer mode (see Normalizer)
        """
        self.policy = Policy(policy_params['ob_dim'], policy_params['ac_dim'])
        self.learning_rate = learning_rate
        self.num_iter = num_iter
        self.normalizer = Normalizer(policy_params['ob_dim'],mode)          
        self.worker = Worker(env_seed=seed + 7,
                             env_name=env_name,
                             policy_params=policy_params,
                             delta_std=delta_std)
        
    def train(self,num_rollouts,max_b):
        """
        Run self.num_iter ARS iterations.

        num_rollouts: number of random directions sampled per iteration
        max_b: number of best directions kept for the update
        """
        for t in range(self.num_iter):
            t1 = time.time() 
            result_dict = self.worker.do_rollout(num_rollouts, self.normalizer, self.policy) 
            #Gather the result

            deltas = result_dict['deltas']
            positive_rewards = result_dict['positive_rewards']
            negative_rewards = result_dict['negative_rewards']

            #Update the weight
            # Standard deviation over every reward collected in this iteration.
            all_rewards = np.array(positive_rewards + negative_rewards)
            sigma_r = all_rewards.std()

            # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
            scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
            order = sorted(scores.keys(), key = lambda x:scores[x], reverse = True)[:max_b]
            rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

            # Updating our policy
            self.policy.update(rollouts, sigma_r, self.learning_rate, max_b)

            # Printing the final reward of the policy after the update.
            # The worker's own environment is used so that training does not
            # depend on a notebook-level global.
            reward_evaluation = self.worker.explore(self.worker.env, self.normalizer, self.policy)
            t2 = time.time() 
            print('total time of one step:', t2 - t1,', reward=' ,reward_evaluation,', iter ', t,' done')      
    
    def evaluate(self,test_num,isRender,max_steps=1000):
        """
        Roll out the unperturbed policy and print return statistics.

        test_num: number of evaluation episodes
        isRender: when truthy, render the environment after every step
        max_steps: upper bound on the number of steps per episode
        """
        env = self.worker.env
        returns = []
        print(self.policy.theta)
        for i in range(test_num):
            print('iter', i)
            state = env.reset()
            done = False
            num_plays = 0
            sum_rewards = 0
            while not done and num_plays < max_steps:
                # The normalizer statistics are only read here, never updated.
                state = self.normalizer.normalize(state)
                action = self.policy.evaluate(state, None, self.worker.delta_std, None)
                state, reward, done, _ = env.step(action)
                sum_rewards += reward
                num_plays += 1
                if isRender:
                    env.render()   
                if num_plays % 100 == 0: print("%i/%i"%(num_plays, max_steps))

            returns.append(sum_rewards)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))


In [18]:

# Building the AI

class Policy():
    '''
    Linear policy: the action is theta.dot(state), where theta has
    shape (output_size, input_size) and starts at zero.
    '''

    def __init__(self, input_size, output_size):
        self.theta = np.zeros((output_size, input_size))
    
    def evaluate(self, input, delta = None, delta_std =  0.02, direction = None):
        '''
        Compute the action for one state.

        input: state vector fed to the linear map
        delta: perturbation with the same shape as theta (unused when
               direction is None)
        delta_std: scale applied to delta
        direction: None for the unperturbed policy, "positive" for
                   theta + delta_std*delta, anything else for
                   theta - delta_std*delta
        '''
        if direction is None:
            return self.theta.dot(input)
        elif direction == "positive":
            return (self.theta + delta_std*delta).dot(input)
        else:
            return (self.theta - delta_std*delta).dot(input)
    
    def sample_deltas(self,num_rollouts):
        '''
        Draw num_rollouts standard-normal perturbations shaped like theta.
        '''
        return [np.random.randn(*self.theta.shape) for _ in range(num_rollouts)]
    
    def update(self, rollouts, sigma_r,learning_rate,max_b):
        '''
        Apply one ARS step to theta in place.

        rollouts: iterable of (r_pos, r_neg, delta) triples
        sigma_r: standard deviation of the rewards, used to scale the step
        learning_rate: step size
        max_b: number of directions the step is averaged over
        '''
        # With a zero (or NaN) reward spread or max_b == 0 the division
        # below would yield inf/NaN and permanently corrupt theta; there is
        # nothing to learn from such a batch, so theta is left untouched.
        if not (max_b > 0 and sigma_r > 0):
            return
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, d in rollouts:
            step += (r_pos - r_neg) * d
        self.theta += learning_rate / (max_b * sigma_r) * step

# NOTE: exploring the policy along one specific direction over one episode is handled by Worker.explore

In [19]:
#Test the algorithm
env_name = 'HalfCheetah-v2'
# This environment instance provides the observation/action dimensions below.
env = gym.make(env_name)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
policy_params={'ob_dim':ob_dim,'ac_dim':ac_dim}
# Hyper-parameters of the run
learning_rate=0.02
delta_std=0.03
num_iter=1000
num_rollouts = 8
max_b=8
seed = 123
mode=1
# Seed NumPy's global RNG so that the perturbations drawn by
# Policy.sample_deltas (np.random.randn) are repeatable across runs.
np.random.seed(seed)
Learner = ARSLearner(env_name,
                     policy_params,
                     learning_rate,
                     delta_std, 
                     num_iter,
                     seed,
                     mode)

Learner.train(num_rollouts,max_b)

total time of one step: 2.5376009941101074 , reward= 0.8397265753729617 , iter  0  done
total time of one step: 2.4842889308929443 , reward= -0.07918574750612375 , iter  1  done
total time of one step: 2.559300184249878 , reward= -0.7451602830053257 , iter  2  done
total time of one step: 2.5057170391082764 , reward= 0.6103123485866337 , iter  3  done
total time of one step: 2.447580337524414 , reward= -1.4011496769055614 , iter  4  done
total time of one step: 3.044841766357422 , reward= -0.4055209406903972 , iter  5  done
total time of one step: 3.1943259239196777 , reward= -0.5275213573039131 , iter  6  done
total time of one step: 3.960387706756592 , reward= -0.9270337793239882 , iter  7  done
total time of one step: 3.232692003250122 , reward= -0.6171118502465192 , iter  8  done
total time of one step: 4.106821060180664 , reward= 6.36267476382277 , iter  9  done
total time of one step: 4.690614938735962 , reward= 5.753988707362596 , iter  10  done
total time of one step: 3.4926688

total time of one step: 2.5127439498901367 , reward= 3599.00023899927 , iter  94  done
total time of one step: 2.4081668853759766 , reward= 3752.789980804012 , iter  95  done
total time of one step: 2.544081926345825 , reward= 3500.0216232032412 , iter  96  done
total time of one step: 2.2697861194610596 , reward= 3958.472986023582 , iter  97  done
total time of one step: 2.8822860717773438 , reward= 3627.6032916835447 , iter  98  done
total time of one step: 2.6118998527526855 , reward= 3390.7206414766897 , iter  99  done
total time of one step: 2.456216812133789 , reward= 3542.6670429539276 , iter  100  done
total time of one step: 2.483604907989502 , reward= 3479.0620058484806 , iter  101  done
total time of one step: 2.4816651344299316 , reward= 3656.9133347296356 , iter  102  done
total time of one step: 2.391184091567993 , reward= 3591.922802735222 , iter  103  done
total time of one step: 2.7095460891723633 , reward= 3838.937195439113 , iter  104  done
total time of one step: 2.

total time of one step: 2.550812005996704 , reward= 4329.298807414825 , iter  187  done
total time of one step: 2.5724809169769287 , reward= 4390.356277539324 , iter  188  done
total time of one step: 2.3370931148529053 , reward= 4266.33289155821 , iter  189  done
total time of one step: 2.337106943130493 , reward= 4384.828435530442 , iter  190  done
total time of one step: 2.336374044418335 , reward= 4497.734216146323 , iter  191  done
total time of one step: 2.360903024673462 , reward= 4353.98436518564 , iter  192  done
total time of one step: 2.3696088790893555 , reward= 4361.791785286776 , iter  193  done
total time of one step: 2.452301025390625 , reward= 4411.906638998138 , iter  194  done
total time of one step: 2.3881897926330566 , reward= 4530.60263704686 , iter  195  done
total time of one step: 2.3469419479370117 , reward= 4671.7556598454175 , iter  196  done
total time of one step: 2.435854196548462 , reward= 4623.2005720667285 , iter  197  done
total time of one step: 2.32

total time of one step: 2.4039649963378906 , reward= 4358.490024929984 , iter  280  done
total time of one step: 2.2913782596588135 , reward= 4552.507235647334 , iter  281  done
total time of one step: 2.6911890506744385 , reward= 4284.442959083113 , iter  282  done
total time of one step: 3.3714311122894287 , reward= 4597.492904695417 , iter  283  done
total time of one step: 3.308701753616333 , reward= 4568.034205860775 , iter  284  done
total time of one step: 3.1416633129119873 , reward= 4696.096730937096 , iter  285  done
total time of one step: 2.5231475830078125 , reward= 4509.483533420056 , iter  286  done
total time of one step: 2.2798659801483154 , reward= 4505.7785897359345 , iter  287  done
total time of one step: 2.484851121902466 , reward= 4810.962744348545 , iter  288  done
total time of one step: 2.3863701820373535 , reward= 4733.191008199599 , iter  289  done
total time of one step: 2.53358793258667 , reward= 4920.830719881392 , iter  290  done
total time of one step: 

KeyboardInterrupt: 

In [1]:
# Evaluate the trained policy over 10 episodes without rendering.
# NOTE: this needs `Learner` from the training cell to exist in the current
# kernel session; run on a fresh kernel it fails with NameError.
Learner.evaluate(test_num=10,isRender=False)

NameError: name 'Learner' is not defined