In [63]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import time
import os

In [84]:
def create_shared_noise():
    """
    Create a large array of noise to be shared by all workers. Used 
    for avoiding the communication of the random perturbations delta.
    """

    seed = 12345
    count = 250000000
    noise = np.random.RandomState(seed).randn(count).astype(np.float64)
    return noise


class SharedNoiseTable(object):
    def __init__(self, noise, seed = 11):

        self.rg = np.random.RandomState(seed)
        self.noise = noise
        assert self.noise.dtype == np.float64

    def get(self, i, dim):
        return self.noise[i:i + dim]

    def sample_index(self, dim):
        return self.rg.randint(0, len(self.noise) - dim + 1)

    def get_delta(self, dim):
        idx = self.sample_index(dim)
        return idx, self.get(idx, dim)

In [85]:
class Worker(object):
    """ 
    Object class for parallel rollout generation.
    """

    def __init__(self,
                 env_name='',
                 policy_params=None,
                 delta_std=0.02,
                 ):
        
        # initialize OpenAI environment for each worker
        
        self.env = gym.make(env_name)
        #self.deltas = SharedNoiseTable(deltas,seed=env_seed + 7 * np.random.randint(1,10))
        self.policy = Policy(policy_params)
        self.delta_std = delta_std
        
    def rollout(self):
        ob = env.reset()
        total_ob = [ob]
        total_reward = 0
        for t in itertools.count():
            action = self.policy.get_action(ob)
            ob,reward,done,_ = env.step(action)
            total_step = t
            total_reward += reward
            if done:
                break
        return total_reward

    def do_rollout(self, weight, num_rollouts):
        rollout_rewards,deltas_list= [],[]
        steps = 0

        for i_rollout in range(num_rollouts):

            delta = np.random.randn(weight.size).reshape(weight.shape)
            deltas_list.append(delta)
            # compute reward and number of timesteps used for positive perturbation rollout
            self.policy.update_weights(weight + self.delta_std * delta)
            pos_reward  = self.rollout()

            # compute reward and number of timesteps used for negative pertubation rollout
            self.policy.update_weights(weight - self.delta_std * delta)
            neg_reward = self.rollout() 

            rollout_rewards.append([pos_reward, neg_reward])

        return {'deltas_list': deltas_list, 'rollout_rewards': rollout_rewards}

In [86]:
class ARSLearner(object):
    """ 
    Object class implementing the ARS algorithm.
    """
    def __init__(self, env_name='HalfCheetah-v2',
                 policy_params=None,
                 l_rate=0.02,
                 num_workers=1,
                 delta_std=0.02, 
                 num_iter=1000,
                ):
        #self.deltas = SharedNoiseTable(deltas_id, seed = seed + 3)
        self.l_rate = l_rate
        self.tworker = Worker(env_name=env_name,
                              policy_params=policy_params,
                              delta_std=delta_std)
        
        self.workers = [Worker(env_name=env_name,
                               policy_params=policy_params,
                               delta_std=delta_std) for i in range(num_workers)]
        
    def train(self,num_iter,num_rollouts,max_b):
        
        weight = self.tworker.policy.get_weights()

        for t in range(num_iter):
            t1 = time.time() 
            result_list = [worker.do_rollout(weight,num_rollouts) for worker in self.workers]
            
            #Gather the result
            
            result_dict = result_list[0]
            roll_reward = result_dict['rollout_rewards']
            roll_delta = result_dict['deltas_list']
            
            r_list = zip(roll_reward,roll_delta)
            r_list = sorted(r_list,key=lambda tup: max(tup[0][0],tup[0][1]))
            r_list = r_list[0:max_b]
            
            #Update the weight
            reward_list = []
            cum_diff = np.zeros(weight.shape)
            for roll_r,delta in r_list:
                cum_diff += (roll_r[0] - roll_r[1]) * delta
                reward_list.extend([roll_r[0],roll_r[1]])
                
            reward_list = np.array(reward_list)
            step = self.l_rate / max_b / np.std(reward_list)
            weight += step * cum_diff
            
            self.tworker.policy.update_weights(weight)
            creward= self.tworker.rollout()
            t2 = time.time()
            
            print('total time of one step:', t2 - t1,', reward=' ,creward)      
            print('iter ', t,' done')
        return weight

In [87]:
class Policy(object):
    def __init__(self, policy_params):

        self.ob_dim = policy_params['ob_dim']
        self.ac_dim = policy_params['ac_dim']
        self.weight = np.zeros([self.ac_dim,ob_dim])
    def update_weights(self, new_weight):
        self.weight = new_weight
        return
    
    def get_weights(self):
        return self.weight
    
    def get_action(self,ob):
        return np.matmul(self.weight,ob)

In [88]:
#Test the algorithm
env_name = 'Swimmer-v2'
env = gym.make(env_name)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
policy_params={'ob_dim':ob_dim,'ac_dim':ac_dim}
l_rate=0.02
num_workers=1
delta_std=0.01
num_iter=1000
#seed=123
mode=1
num_rollouts = 1
max_b=1
Learner = ARSLearner(env_name,
                     policy_params,
                     l_rate,
                     num_workers,
                     delta_std,
                     num_iter)

weight = Learner.train(num_iter,num_rollouts,max_b)
policy = Policy(policy_params)
policy.update_weights(weight)

total time of one step: 0.3754289150238037 , reward= 17.472066472287185
iter  0  done
total time of one step: 0.3979618549346924 , reward= 11.881805256448452
iter  1  done
total time of one step: 0.40702199935913086 , reward= 36.127401232935036
iter  2  done
total time of one step: 0.4505281448364258 , reward= 26.014739923077446
iter  3  done
total time of one step: 0.3680760860443115 , reward= 41.05683282684685
iter  4  done
total time of one step: 0.3510298728942871 , reward= 22.774290666462406
iter  5  done
total time of one step: 0.4340701103210449 , reward= 10.434295445859243
iter  6  done
total time of one step: 0.4016151428222656 , reward= 34.39523208620195
iter  7  done
total time of one step: 0.35937070846557617 , reward= 20.716696102952742
iter  8  done
total time of one step: 0.3809230327606201 , reward= 21.42337678979051
iter  9  done
total time of one step: 0.45606017112731934 , reward= 41.714371760918944
iter  10  done
total time of one step: 0.400299072265625 , reward= 0

total time of one step: 0.44226908683776855 , reward= 27.505881857798915
iter  95  done
total time of one step: 0.35379600524902344 , reward= 23.051436820696367
iter  96  done
total time of one step: 0.4372518062591553 , reward= 25.0366285597029
iter  97  done
total time of one step: 0.4277360439300537 , reward= 24.78493209975506
iter  98  done
total time of one step: 0.4667487144470215 , reward= 10.60264173503149
iter  99  done
total time of one step: 0.3859429359436035 , reward= 29.048327235116478
iter  100  done
total time of one step: 0.37778687477111816 , reward= 25.47206044197963
iter  101  done
total time of one step: 0.4928879737854004 , reward= 32.568959800412784
iter  102  done
total time of one step: 0.4274418354034424 , reward= 25.684280280665252
iter  103  done
total time of one step: 0.428865909576416 , reward= -3.4182706340519404
iter  104  done
total time of one step: 0.363109827041626 , reward= 36.22979572576566
iter  105  done
total time of one step: 0.532820940017700

total time of one step: 0.36995506286621094 , reward= 23.259522597308035
iter  188  done
total time of one step: 0.3539459705352783 , reward= 16.81271699531611
iter  189  done
total time of one step: 0.3187530040740967 , reward= 31.984805639578077
iter  190  done
total time of one step: 0.3374011516571045 , reward= 12.334475682131417
iter  191  done
total time of one step: 0.46479105949401855 , reward= 33.14861590549964
iter  192  done
total time of one step: 0.38370680809020996 , reward= 7.105989948740172
iter  193  done
total time of one step: 0.36763787269592285 , reward= 34.41988447591464
iter  194  done
total time of one step: 0.330718994140625 , reward= 31.95944500425319
iter  195  done
total time of one step: 0.4486689567565918 , reward= 24.464340981929663
iter  196  done
total time of one step: 0.4745657444000244 , reward= 25.705897219208158
iter  197  done
total time of one step: 0.38999009132385254 , reward= 34.21803032794351
iter  198  done
total time of one step: 0.33748984

total time of one step: 0.341414213180542 , reward= 216.06590793780518
iter  282  done
total time of one step: 0.34049510955810547 , reward= 125.81067963758264
iter  283  done
total time of one step: 0.3722057342529297 , reward= 218.7923827450481
iter  284  done
total time of one step: 0.34413599967956543 , reward= 112.23856679883005
iter  285  done
total time of one step: 0.28422021865844727 , reward= 164.0478247497081
iter  286  done
total time of one step: 0.3050079345703125 , reward= 128.7649102391133
iter  287  done
total time of one step: 0.3582489490509033 , reward= 95.09596708294602
iter  288  done
total time of one step: 0.3722047805786133 , reward= 132.54457699141037
iter  289  done
total time of one step: 0.3276190757751465 , reward= 116.0071701786091
iter  290  done
total time of one step: 0.2887279987335205 , reward= 116.92538031264145
iter  291  done
total time of one step: 0.2854130268096924 , reward= 114.44116817223221
iter  292  done
total time of one step: 0.312709093

total time of one step: 0.3331568241119385 , reward= 152.77312627263922
iter  376  done
total time of one step: 0.3258202075958252 , reward= 265.1002761933773
iter  377  done
total time of one step: 0.3943610191345215 , reward= 249.62212123798918
iter  378  done
total time of one step: 0.3680448532104492 , reward= 258.36890496187095
iter  379  done
total time of one step: 0.34197402000427246 , reward= 240.73454887522922
iter  380  done
total time of one step: 0.3229711055755615 , reward= 243.32243191879047
iter  381  done
total time of one step: 0.36541128158569336 , reward= 218.21985497855627
iter  382  done
total time of one step: 0.3985710144042969 , reward= 225.14681796932544
iter  383  done
total time of one step: 0.2970130443572998 , reward= 219.28837232034758
iter  384  done
total time of one step: 0.3166346549987793 , reward= 248.49099878948283
iter  385  done
total time of one step: 0.29273080825805664 , reward= 242.02444184570888
iter  386  done
total time of one step: 0.3997

total time of one step: 0.36111998558044434 , reward= 158.72604063448097
iter  470  done
total time of one step: 0.37653517723083496 , reward= 186.35637113177538
iter  471  done
total time of one step: 0.3408982753753662 , reward= 199.5174755314568
iter  472  done
total time of one step: 0.32192087173461914 , reward= 201.8651970858235
iter  473  done
total time of one step: 0.36325693130493164 , reward= 187.97418595159303
iter  474  done
total time of one step: 0.4075448513031006 , reward= 213.98264190872897
iter  475  done
total time of one step: 0.3332200050354004 , reward= 190.75951681446674
iter  476  done
total time of one step: 0.329423189163208 , reward= 180.81339753765135
iter  477  done
total time of one step: 0.379241943359375 , reward= 171.42602445580553
iter  478  done
total time of one step: 0.4145357608795166 , reward= 164.8450036792809
iter  479  done
total time of one step: 0.39449286460876465 , reward= 191.42525537169342
iter  480  done
total time of one step: 0.373207

total time of one step: 0.33932995796203613 , reward= 230.335818041503
iter  564  done
total time of one step: 0.36870884895324707 , reward= 236.3215138272793
iter  565  done
total time of one step: 0.3873567581176758 , reward= 205.06342038197346
iter  566  done
total time of one step: 0.43549537658691406 , reward= 216.3527716995277
iter  567  done
total time of one step: 0.3031589984893799 , reward= 219.36522770744142
iter  568  done
total time of one step: 0.2884187698364258 , reward= 230.7702666104734
iter  569  done
total time of one step: 0.31563305854797363 , reward= 223.4405021032901
iter  570  done
total time of one step: 0.36506009101867676 , reward= 209.98695339260718
iter  571  done
total time of one step: 0.3339970111846924 , reward= 227.89398908442152
iter  572  done
total time of one step: 0.29331040382385254 , reward= 243.4497032184612
iter  573  done
total time of one step: 0.2879180908203125 , reward= 246.21417176574488
iter  574  done
total time of one step: 0.2939081

total time of one step: 0.36746883392333984 , reward= 66.00399012586087
iter  658  done
total time of one step: 0.43415307998657227 , reward= 72.66048697736538
iter  659  done
total time of one step: 0.31084418296813965 , reward= 97.89805566265186
iter  660  done
total time of one step: 0.35568714141845703 , reward= 183.39697218863162
iter  661  done
total time of one step: 0.3763430118560791 , reward= 127.43775547414609
iter  662  done
total time of one step: 0.3855559825897217 , reward= 125.51993269631105
iter  663  done
total time of one step: 0.35938286781311035 , reward= 136.1574264625332
iter  664  done
total time of one step: 0.33362603187561035 , reward= 72.94585037277238
iter  665  done
total time of one step: 0.37932801246643066 , reward= 72.52631945787921
iter  666  done
total time of one step: 0.4439816474914551 , reward= 132.2900857195233
iter  667  done
total time of one step: 0.3146626949310303 , reward= 111.38232467278863
iter  668  done
total time of one step: 0.335638

total time of one step: 0.34476494789123535 , reward= 263.4094701175226
iter  752  done
total time of one step: 0.40818190574645996 , reward= 295.40147242184435
iter  753  done
total time of one step: 0.4121088981628418 , reward= 292.5388735356344
iter  754  done
total time of one step: 0.3589310646057129 , reward= 255.9700599826568
iter  755  done
total time of one step: 0.31557393074035645 , reward= 269.8542663427826
iter  756  done
total time of one step: 0.3973541259765625 , reward= 253.95366934993393
iter  757  done
total time of one step: 0.41481900215148926 , reward= 279.6412117163592
iter  758  done
total time of one step: 0.3233370780944824 , reward= 304.5650134966331
iter  759  done
total time of one step: 0.33395981788635254 , reward= 308.2477378475382
iter  760  done
total time of one step: 0.30098485946655273 , reward= 316.1676814770009
iter  761  done
total time of one step: 0.42564892768859863 , reward= 313.89023803405956
iter  762  done
total time of one step: 0.3467850

total time of one step: 0.29894208908081055 , reward= 348.4218577846899
iter  846  done
total time of one step: 0.34934186935424805 , reward= 348.8212258376128
iter  847  done
total time of one step: 0.34727907180786133 , reward= 340.53501238328346
iter  848  done
total time of one step: 0.382551908493042 , reward= 341.6688672063575
iter  849  done
total time of one step: 0.4077591896057129 , reward= 333.753187525867
iter  850  done
total time of one step: 0.368480920791626 , reward= 349.86075171578
iter  851  done
total time of one step: 0.3531370162963867 , reward= 345.0434059127269
iter  852  done
total time of one step: 0.44995903968811035 , reward= 344.41959405157786
iter  853  done
total time of one step: 0.43712782859802246 , reward= 337.12072662814455
iter  854  done
total time of one step: 0.3456568717956543 , reward= 338.12628051804313
iter  855  done
total time of one step: 0.3638629913330078 , reward= 332.05465864813544
iter  856  done
total time of one step: 0.352193117141

total time of one step: 0.3310971260070801 , reward= 318.9184486400851
iter  940  done
total time of one step: 0.32173895835876465 , reward= 338.61716413702277
iter  941  done
total time of one step: 0.4299159049987793 , reward= 346.2519128871984
iter  942  done
total time of one step: 0.385875940322876 , reward= 335.7616798890242
iter  943  done
total time of one step: 0.33879899978637695 , reward= 333.3553151686149
iter  944  done
total time of one step: 0.3152329921722412 , reward= 341.16266992220335
iter  945  done
total time of one step: 0.5537285804748535 , reward= 341.8805996738748
iter  946  done
total time of one step: 0.40126490592956543 , reward= 335.85809859102403
iter  947  done
total time of one step: 0.3437221050262451 , reward= 341.78632058513625
iter  948  done
total time of one step: 0.38565516471862793 , reward= 345.64376805436973
iter  949  done
total time of one step: 0.42778706550598145 , reward= 345.1964548173506
iter  950  done
total time of one step: 0.35996794

In [89]:
env = gym.make(env_name)

returns = []
observations = []
actions = []
isRender = True

for i in range(10):
    print('iter', i)
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    while not done:
        action = np.dot(weight, obs)
        observations.append(obs)
        actions.append(action)


        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        if isRender:
            env.render()
        if steps % 100 == 0: print("%i/%i"%(steps, env.spec.timestep_limit))
        if steps >= env.spec.timestep_limit:
            break
    returns.append(totalr)

print('returns', returns)
print('mean return', np.mean(returns))
print('std of return', np.std(returns))


iter 0
Creating window glfw
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 1
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 2
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 3
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 4
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 5
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
iter 6
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000


NameError: name 'exit' is not defined

In [None]:
env_name = 'Swimmer-v2'
env = gym.make(env_name)
env.render()
env.close()
print(env.action_space.shape)

In [10]:
env_name = 'Swimmer-v2'
env = gym.make(env_name)
ac_dim = env.observation_space.shape[0]
ob_dim = env.action_space.shape[0]
policy_params={'ob_dim':ob_dim,'ac_dim':ac_dim}
policy = Policy(policy_params)
weight = policy.get_weights()

deltas_id = create_shared_noise()
deltas = SharedNoiseTable(deltas_id, seed = 12345)

dx, delta = deltas.get_delta(weight.size)
print(delta.shape)
delta = (0.02 * delta).reshape(weight.shape)
print(delta.shape)

(16,)
(8, 2)
