In [9]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import time
import os

In [49]:
def create_shared_noise():
    """
    Create a large array of noise to be shared by all workers. Used 
    for avoiding the communication of the random perturbations delta.
    """

    seed = 12345
    count = 250000000
    noise = np.random.RandomState(seed).randn(count).astype(np.float64)
    return noise


class SharedNoiseTable(object):
    def __init__(self, noise, seed = 11):

        self.rg = np.random.RandomState(seed)
        self.noise = noise
        assert self.noise.dtype == np.float64

    def get(self, i, dim):
        return self.noise[i:i + dim]

    def sample_index(self, dim):
        return self.rg.randint(0, len(self.noise) - dim + 1)

    def get_delta(self, dim):
        idx = self.sample_index(dim)
        return self.get(idx, dim)

In [50]:
class Collector(object):
    
    def __init__(self, nb_inputs):
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.cov = np.zeros(nb_inputs)
    
    def update(self, ob):
        self.n += 1.
        last_mean = self.mean.copy()
        self.mean += (ob - self.mean) / self.n
        self.mean_diff += (ob - last_mean) * (ob - self.mean)
        self.cov = (self.mean_diff / self.n).clip(min = 1e-2)
    
    def process_ob(self, ob):
        obs_mean = self.mean
        obs_std = np.sqrt(self.cov)
        #return (ob - obs_mean) / obs_std
        return ob

In [56]:
class Worker(object):
    """ 
    Object class for parallel rollout generation.
    """

    def __init__(self,
                 env_seed=123,
                 env_name=None,
                 policy_params=None,
                 noise=None,
                 delta_std=0.02,
                 ):
        
        # initialize OpenAI environment for each worker
        
        self.env = gym.make(env_name)
        self.env.seed(env_seed)
        self.policy = Policy(policy_params)
        self.delta_std = delta_std
        self.delta_table = SharedNoiseTable(noise,seed=env_seed)
        
    def rollout(self,collector,evaluate):
        ob = env.reset()
        total_ob = [ob]
        total_reward = 0
        for t in itertools.count():
            if not evaluate:
                collector.update(ob) 
            ob = collector.process_ob(ob)
            action = self.policy.get_action(ob)
            ob,reward,done,_ = env.step(action)
            total_step = t
            total_reward += reward
            if done or t > 1000:
                break
        return total_reward

    def do_rollout(self, weight, num_rollouts,collector,evaluate):
        if evaluate:
            self.policy.update_weights(weight)
            reward  = self.rollout(collector,evaluate)
            return reward
        else:      
            rollout_rewards,rollout_deltas= [],[]

            for i_rollout in range(num_rollouts):

                idx,delta = self.delta_table.get_delta(weight.size).reshape(weight.shape)
                rollout_deltas.append(delta)

                # compute reward and number of timesteps used for positive perturbation rollout
                self.policy.update_weights(weight + self.delta_std * delta)
                pos_reward  = self.rollout(collector,evaluate)

                # compute reward and number of timesteps used for negative pertubation rollout
                self.policy.update_weights(weight - self.delta_std * delta)
                neg_reward = self.rollout(collector,evaluate) 

                rollout_rewards.append([pos_reward, neg_reward])

            return {'rollout_deltas': rollout_deltas, 'rollout_rewards': rollout_rewards}

In [57]:
class ARSLearner(object):
    """ 
    Object class implementing the ARS algorithm.
    """
    def __init__(self,
                 env_seed=123,
                 env_name='HalfCheetah-v2',
                 policy_params=None,
                 l_rate=0.02,
                 num_workers=1,
                 noise=None,
                 delta_std=0.02, 
                 num_iter=1000,
                ):

        self.l_rate = l_rate
        self.collector = Collector(policy_params['ob_dim'])
        self.estimator = Worker(env_seed + 7,
                              env_name=env_name,
                              policy_params=policy_params,
                              noise=noise,
                              delta_std=delta_std)
                
        self.worker = Worker(env_seed + 8,
                               env_name=env_name,
                               policy_params=policy_params,
                               noise=noise,
                               delta_std=delta_std)
        
    def train(self,num_iter,num_rollouts,max_b):
        
        weight = self.estimator.policy.get_weights()
        r_list = []
        for t in range(num_iter):
            t1 = time.time() 
            result_dict = self.worker.do_rollout(weight,num_rollouts,self.collector,False) 
            #Gather the result

            roll_reward = result_dict['rollout_rewards']
            roll_delta = result_dict['rollout_deltas']

            r_list = zip(roll_reward,roll_delta)
            r_list = sorted(r_list,key=lambda tup: max(tup[0][0],tup[0][1]))
            r_list = r_list[0:max_b]

            #Update the weight
            reward_list = []
            cum_diff = np.zeros(weight.shape)
            for roll_r,delta in r_list:
                cum_diff += (roll_r[0] - roll_r[1]) * delta
                reward_list.extend([roll_r[0],roll_r[1]])
                
            reward_list = np.array(reward_list)
            step = self.l_rate / max_b / np.std(reward_list)
            weight += step * cum_diff
            
            creward = self.estimator.do_rollout(weight, num_rollouts,self.collector,True)
            t2 = time.time()
            r_list.append(creward)
            print('total time of one step:', t2 - t1,', reward=' ,creward,', iter ', t,' done')      
        return weight

In [58]:
class Policy(object):
    def __init__(self, policy_params):
        self.ob_dim = policy_params['ob_dim']
        self.ac_dim = policy_params['ac_dim']
        self.weights = np.zeros([self.ac_dim,self.ob_dim])
    def update_weights(self, new_weight):
        self.weights = new_weight
        return
    
    def get_weights(self):
        return self.weights
    
    def get_action(self,ob):
        return np.matmul(self.weights,ob)

In [15]:
print('create the shared noise table')
noise = create_shared_noise()
print('Noise created')

create the shared noise table
Noise created


In [60]:
#Test the algorithm
env_name = 'Swimmer-v2'
env = gym.make(env_name)
env_seed = 1234
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
policy_params={'ob_dim':ob_dim,'ac_dim':ac_dim}
l_rate=0.02
num_workers=1
delta_std=0.01
num_iter=1000
#seed=123
mode=1
num_rollouts = 1
max_b=1
Learner = ARSLearner(env_seed,
                     env_name,
                     policy_params,
                     l_rate,
                     num_workers,
                     noise,
                     delta_std,
                     num_iter)

weight = Learner.train(num_iter,num_rollouts,max_b)
policy = Policy(policy_params)
policy.update_weights(weight)

total time of one step: 0.4802379608154297 , reward= 29.525498369394363 , iter  0  done
total time of one step: 0.41430187225341797 , reward= 20.511263623480033 , iter  1  done
total time of one step: 0.4304158687591553 , reward= -0.051531159902274457 , iter  2  done
total time of one step: 0.45241403579711914 , reward= -21.977534734678567 , iter  3  done
total time of one step: 0.41699671745300293 , reward= 4.031195165945814 , iter  4  done
total time of one step: 0.3592109680175781 , reward= -18.618199686473446 , iter  5  done
total time of one step: 0.38030004501342773 , reward= 23.178078063882733 , iter  6  done
total time of one step: 0.5099830627441406 , reward= 21.624552633034575 , iter  7  done
total time of one step: 0.49422621726989746 , reward= 14.51543911178507 , iter  8  done
total time of one step: 0.4513881206512451 , reward= 22.643714401782468 , iter  9  done
total time of one step: 0.48417186737060547 , reward= 18.374841914065687 , iter  10  done
total time of one step

total time of one step: 0.6488900184631348 , reward= 8.285605332153562 , iter  93  done
total time of one step: 0.49552273750305176 , reward= 46.43774927196545 , iter  94  done
total time of one step: 0.48382019996643066 , reward= 7.45574091648319 , iter  95  done
total time of one step: 0.5097489356994629 , reward= 7.702238774571546 , iter  96  done
total time of one step: 0.5691239833831787 , reward= -73.17612229591958 , iter  97  done
total time of one step: 0.4764981269836426 , reward= 37.55795884318855 , iter  98  done
total time of one step: 0.4551277160644531 , reward= 41.64106242733499 , iter  99  done
total time of one step: 0.4657149314880371 , reward= 48.85766454537122 , iter  100  done
total time of one step: 0.4062068462371826 , reward= 80.91602141182183 , iter  101  done
total time of one step: 0.38386106491088867 , reward= 33.98927044386185 , iter  102  done
total time of one step: 0.5029840469360352 , reward= 57.61913677281485 , iter  103  done
total time of one step: 0

total time of one step: 0.3670461177825928 , reward= 37.11285110090698 , iter  184  done
total time of one step: 0.39127182960510254 , reward= -8.656600449897228 , iter  185  done
total time of one step: 0.4491848945617676 , reward= -20.24884445651602 , iter  186  done
total time of one step: 0.4227461814880371 , reward= -29.698491607513283 , iter  187  done
total time of one step: 0.362929105758667 , reward= 10.29349166188302 , iter  188  done
total time of one step: 0.3775331974029541 , reward= -23.599650828559312 , iter  189  done
total time of one step: 0.5121610164642334 , reward= 11.771056955044644 , iter  190  done
total time of one step: 0.3889327049255371 , reward= -29.504967779481266 , iter  191  done
total time of one step: 0.3709909915924072 , reward= 3.287827587459191 , iter  192  done
total time of one step: 0.3989219665527344 , reward= 15.490885572637072 , iter  193  done
total time of one step: 0.4556701183319092 , reward= 9.890549444330329 , iter  194  done
total time 

total time of one step: 0.5314071178436279 , reward= -13.117663431071362 , iter  275  done
total time of one step: 0.43196988105773926 , reward= 15.77375211772985 , iter  276  done
total time of one step: 0.3698909282684326 , reward= 13.035372265283254 , iter  277  done
total time of one step: 0.45595502853393555 , reward= -7.452668831958087 , iter  278  done
total time of one step: 0.4412841796875 , reward= 1.538857454481698 , iter  279  done
total time of one step: 0.3872559070587158 , reward= 7.3545908398045405 , iter  280  done
total time of one step: 0.37319493293762207 , reward= 1.1970465273284652 , iter  281  done
total time of one step: 0.4678981304168701 , reward= -0.15789076576097416 , iter  282  done
total time of one step: 0.3416597843170166 , reward= -14.67579034006063 , iter  283  done
total time of one step: 0.37686705589294434 , reward= 15.926650801237374 , iter  284  done
total time of one step: 0.37026309967041016 , reward= 11.225304756767986 , iter  285  done
total t

total time of one step: 0.4077119827270508 , reward= 15.819645342990388 , iter  366  done
total time of one step: 0.5181057453155518 , reward= 6.93561614980554 , iter  367  done
total time of one step: 0.4174041748046875 , reward= -0.1768101238453724 , iter  368  done
total time of one step: 0.38648080825805664 , reward= 37.18585358218898 , iter  369  done
total time of one step: 0.45650792121887207 , reward= 6.854295984077201 , iter  370  done
total time of one step: 0.4140357971191406 , reward= 26.039876832818752 , iter  371  done
total time of one step: 0.4209709167480469 , reward= 19.637429029412324 , iter  372  done
total time of one step: 0.38123488426208496 , reward= 7.3554492622002865 , iter  373  done
total time of one step: 0.5756361484527588 , reward= -22.97638993366586 , iter  374  done
total time of one step: 0.4093759059906006 , reward= -10.74901355858192 , iter  375  done
total time of one step: 0.3751640319824219 , reward= 16.13451637953352 , iter  376  done
total time 

total time of one step: 0.3980679512023926 , reward= -18.915626325712285 , iter  457  done
total time of one step: 0.441087007522583 , reward= -23.922411440990555 , iter  458  done
total time of one step: 0.5095338821411133 , reward= -7.505851354769509 , iter  459  done
total time of one step: 0.36439013481140137 , reward= 16.447498742719002 , iter  460  done
total time of one step: 0.40061521530151367 , reward= 34.56828300697942 , iter  461  done
total time of one step: 0.46753668785095215 , reward= 12.971112168764005 , iter  462  done
total time of one step: 0.39557790756225586 , reward= 14.135256958993795 , iter  463  done
total time of one step: 0.3677489757537842 , reward= 6.936876266976695 , iter  464  done
total time of one step: 0.4166891574859619 , reward= -11.491855657259922 , iter  465  done
total time of one step: 0.454730749130249 , reward= 45.98105426257185 , iter  466  done
total time of one step: 0.4169480800628662 , reward= 36.42479079015188 , iter  467  done
total tim

total time of one step: 0.3952829837799072 , reward= -21.759987233241354 , iter  548  done
total time of one step: 0.35750603675842285 , reward= 22.506759960869008 , iter  549  done
total time of one step: 0.3401758670806885 , reward= -19.54404468617298 , iter  550  done
total time of one step: 0.44925999641418457 , reward= -10.240196686693833 , iter  551  done
total time of one step: 0.395449161529541 , reward= -15.754174255367895 , iter  552  done
total time of one step: 0.35227203369140625 , reward= -27.292869123222022 , iter  553  done
total time of one step: 0.4073910713195801 , reward= 18.79857031756833 , iter  554  done
total time of one step: 0.44003987312316895 , reward= -21.0816023860183 , iter  555  done
total time of one step: 0.33141398429870605 , reward= -20.973085672193488 , iter  556  done
total time of one step: 0.45821499824523926 , reward= 12.98867243234639 , iter  557  done
total time of one step: 0.5130929946899414 , reward= 26.702956583400844 , iter  558  done
tot

total time of one step: 0.42107701301574707 , reward= 25.85050123365483 , iter  639  done
total time of one step: 0.3386251926422119 , reward= -15.198265494248863 , iter  640  done
total time of one step: 0.46048998832702637 , reward= -11.02623293834399 , iter  641  done
total time of one step: 0.42888593673706055 , reward= -21.822484739686534 , iter  642  done
total time of one step: 0.37956714630126953 , reward= 22.873898922850998 , iter  643  done
total time of one step: 0.4115028381347656 , reward= 13.139322508306197 , iter  644  done
total time of one step: 0.44920992851257324 , reward= -20.006349596113186 , iter  645  done
total time of one step: 0.4645121097564697 , reward= 7.1411206058769165 , iter  646  done
total time of one step: 0.371229887008667 , reward= -15.491626249034121 , iter  647  done
total time of one step: 0.377500057220459 , reward= -22.52346643684952 , iter  648  done
total time of one step: 0.45836710929870605 , reward= 26.19528548464192 , iter  649  done
tota

total time of one step: 0.5311510562896729 , reward= 18.09657277303809 , iter  730  done
total time of one step: 0.36986494064331055 , reward= 13.587510005045813 , iter  731  done
total time of one step: 0.36028385162353516 , reward= 22.082002110754654 , iter  732  done
total time of one step: 0.3867628574371338 , reward= 24.937057662324207 , iter  733  done
total time of one step: 0.45967888832092285 , reward= 28.548330364355785 , iter  734  done
total time of one step: 0.37251996994018555 , reward= -21.728005494707652 , iter  735  done
total time of one step: 0.3867471218109131 , reward= 28.78159364719294 , iter  736  done
total time of one step: 0.41892385482788086 , reward= -19.023464707492952 , iter  737  done
total time of one step: 0.5839231014251709 , reward= -14.145561783364649 , iter  738  done
total time of one step: 0.32799196243286133 , reward= -18.145387417124507 , iter  739  done
total time of one step: 0.3915541172027588 , reward= -24.405695836406895 , iter  740  done
t

total time of one step: 0.4902329444885254 , reward= -1.620157195914815 , iter  821  done
total time of one step: 0.4171602725982666 , reward= 12.074442065801888 , iter  822  done
total time of one step: 0.4400620460510254 , reward= 9.281807315907521 , iter  823  done
total time of one step: 0.6284940242767334 , reward= 27.338682682339577 , iter  824  done
total time of one step: 0.44213294982910156 , reward= 2.4391866358792273 , iter  825  done
total time of one step: 0.4485890865325928 , reward= 5.6401772418542615 , iter  826  done
total time of one step: 0.5106279850006104 , reward= 13.563419691095824 , iter  827  done
total time of one step: 0.4766561985015869 , reward= 13.901962517954075 , iter  828  done
total time of one step: 0.6199908256530762 , reward= 14.324653899862277 , iter  829  done
total time of one step: 0.7505228519439697 , reward= 14.95086821412744 , iter  830  done
total time of one step: 0.5236208438873291 , reward= 8.798042101060728 , iter  831  done
total time o

total time of one step: 0.5970239639282227 , reward= 9.282648168730129 , iter  913  done
total time of one step: 0.5427248477935791 , reward= 10.6958833500856 , iter  914  done
total time of one step: 0.4807450771331787 , reward= 13.176766982273792 , iter  915  done
total time of one step: 0.5028598308563232 , reward= 17.24902891923484 , iter  916  done
total time of one step: 0.47165489196777344 , reward= 12.051704021165559 , iter  917  done
total time of one step: 0.4133119583129883 , reward= 5.5205530138968975 , iter  918  done
total time of one step: 0.5070850849151611 , reward= 11.479318722897961 , iter  919  done
total time of one step: 0.5063090324401855 , reward= 4.8285046365343085 , iter  920  done
total time of one step: 0.41242384910583496 , reward= 3.770051065287119 , iter  921  done
total time of one step: 0.4223458766937256 , reward= 26.11559032047639 , iter  922  done
total time of one step: 0.5224981307983398 , reward= 5.887173799531239 , iter  923  done
total time of o

In [19]:
env = gym.make(env_name)

weight = Learner.estimator.policy.get_weights()
returns = []
observations = []
actions = []
isRender = True

for i in range(10):
    print('iter', i)
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    while not done:
        action = np.dot(weight, obs)
        observations.append(obs)
        actions.append(action)


        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        if isRender:
            env.render()
        if steps % 100 == 0: print("%i/%i"%(steps, env.spec.timestep_limit))
        if steps >= env.spec.timestep_limit:
            break
    returns.append(totalr)

print('returns', returns)
print('mean return', np.mean(returns))
print('std of return', np.std(returns))


iter 0
Creating window glfw
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 1
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 2
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 3
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 4
100/1000
200/1000
300/1000
400/1000


KeyboardInterrupt: 

In [None]:
env_name = 'Swimmer-v2'
env = gym.make(env_name)
env.render()
env.close()
print(env.action_space.shape)

In [10]:
env_name = 'Swimmer-v2'
env = gym.make(env_name)
ac_dim = env.observation_space.shape[0]
ob_dim = env.action_space.shape[0]
policy_params={'ob_dim':ob_dim,'ac_dim':ac_dim}
policy = Policy(policy_params)
weight = policy.get_weights()

deltas_id = create_shared_noise()
deltas = SharedNoiseTable(deltas_id, seed = 12345)

dx, delta = deltas.get_delta(weight.size)
print(delta.shape)
delta = (0.02 * delta).reshape(weight.shape)
print(delta.shape)

(16,)
(8, 2)
