In [16]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import time
import os

In [17]:
def create_shared_noise(seed=12345, count=250000000):
    """
    Create a large array of noise to be shared by all workers. Used
    for avoiding the communication of the random perturbations delta.

    Parameters
    ----------
    seed : int, optional
        Seed of the dedicated ``RandomState`` used to draw the table, so
        the same table is reproduced on every call with the same seed.
    count : int, optional
        Number of samples to draw. The default (250 million float64
        values) needs about 2 GB of memory; pass a smaller value for
        quick experiments.

    Returns
    -------
    numpy.ndarray
        1-D float64 array holding ``count`` standard-normal samples.
    """
    # A private RandomState keeps the table independent of the global
    # NumPy random state.
    noise = np.random.RandomState(seed).randn(count).astype(np.float64)
    return noise


class SharedNoiseTable(object):
    """
    Window onto a shared 1-D float64 noise array, paired with a private
    random generator that picks start offsets into that array.
    """

    def __init__(self, noise, seed = 11):
        # The generator is only used to choose offsets; the noise values
        # themselves come from the array handed in by the caller.
        self.rg = np.random.RandomState(seed)
        self.noise = noise
        assert self.noise.dtype == np.float64

    def get(self, i, dim):
        """Return the ``dim`` consecutive noise entries starting at offset ``i``."""
        stop = i + dim
        return self.noise[i:stop]

    def sample_index(self, dim):
        """Draw a start offset such that a slice of length ``dim`` fits in the table."""
        # randint's upper bound is exclusive, so the largest offset is len - dim.
        n_valid_starts = len(self.noise) - dim + 1
        return self.rg.randint(0, n_valid_starts)

    def get_delta(self, dim):
        """Sample an offset and return ``(offset, noise_slice_of_length_dim)``."""
        start = self.sample_index(dim)
        return start, self.get(start, dim)

In [23]:
class Worker(object):
    """ 
    Object class for rollout generation.

    Each worker owns a gym environment, a ``Policy`` and a
    ``SharedNoiseTable`` view, and evaluates pairs of symmetric weight
    perturbations (``weight + d`` / ``weight - d``) on that environment.

    Parameters
    ----------
    env_seed : int
        Seed passed to the environment; ``env_seed + 7`` seeds the
        noise-index sampler.
    env_name : str
        Gym environment id handed to ``gym.make``.
    policy_params : dict or None
        Dict with ``'ob_dim'`` and ``'ac_dim'``. When ``None`` the
        dimensions are read from the environment's spaces.
    deltas : numpy.ndarray
        Shared float64 noise array (see ``create_shared_noise``).
    delta_std : float
        Scale applied to a noise slice to obtain the weight perturbation.
    mode : int
        Stored as ``self.mode``; the rollout methods take the mode as an
        explicit argument.
    """

    def __init__(self, env_seed,
                 env_name='',
                 policy_params=None,
                 deltas=None,
                 delta_std=0.02,
                 mode=1
                 ):

        # initialize OpenAI environment for each worker
        # NOTE(review): env.seed() and the 4-tuple step() result used below
        # belong to the legacy gym API - confirm the installed gym version.
        self.env = gym.make(env_name)
        self.env.seed(env_seed)

        # Stored as `deltas` because do_rollout reads `self.deltas`.
        self.deltas = SharedNoiseTable(deltas, seed=env_seed + 7)

        self.ob_dim = self.env.observation_space.shape[0]
        self.ac_dim = self.env.action_space.shape[0]
        if policy_params is None:
            policy_params = {'ob_dim': self.ob_dim, 'ac_dim': self.ac_dim}
        self.policy = Policy(policy_params)
        self.delta_std = delta_std
        self.mode = mode

    def rollout(self, mode, mean, Sigma):
        """
        Run one episode with the current policy weights.

        Returns
        -------
        tuple
            ``(total_reward, total_step, total_ob)``: summed reward, number
            of environment steps taken, and the list of observations seen
            (the reset observation followed by one per step).
        """
        ob = self.env.reset()
        total_ob = [ob]
        total_reward = 0
        total_step = 0
        # Runs until the environment reports `done`.
        for t in itertools.count():
            action = self.policy.get_action(ob, mode, mean, Sigma)
            ob, reward, done, _ = self.env.step(action)
            total_ob.append(ob)
            total_step = t + 1  # t is zero-based; count the steps actually taken
            total_reward += reward
            if done:
                break
        return total_reward, total_step, total_ob

    def do_rollout(self, weight, num_rollouts, mode, mean, Sigma):
        """
        Evaluate ``num_rollouts`` random directions around ``weight``.

        For every direction a noise slice is drawn from the shared table,
        scaled once by ``delta_std``, and the policy is rolled out with
        ``weight + delta`` and ``weight - delta``.

        Returns
        -------
        dict
            ``'deltas_idx'``: noise-table offsets of the directions used,
            ``'rollout_rewards'``: ``[pos_reward, neg_reward]`` per direction,
            ``'steps'``: total environment steps,
            ``'rollout_ob'``: ``[pos_ob, neg_ob]`` observation lists per direction.
        """
        rollout_rewards, deltas_idx, rollout_ob = [], [], []
        steps = 0

        for _ in range(num_rollouts):
            idx, delta = self.deltas.get_delta(weight.size)
            # Scale exactly once here; the updates below use `delta` as is.
            delta = (self.delta_std * delta).reshape(weight.shape)
            deltas_idx.append(idx)

            # compute reward and number of timesteps used for positive perturbation rollout
            self.policy.update_weights(weight + delta)
            pos_reward, pos_steps, pos_ob = self.rollout(mode, mean, Sigma)

            # compute reward and number of timesteps used for negative perturbation rollout
            self.policy.update_weights(weight - delta)
            neg_reward, neg_steps, neg_ob = self.rollout(mode, mean, Sigma)
            steps += pos_steps + neg_steps

            rollout_rewards.append([pos_reward, neg_reward])
            rollout_ob.append([pos_ob, neg_ob])

        return {'deltas_idx': deltas_idx, 'rollout_rewards': rollout_rewards, "steps" : steps, "rollout_ob": rollout_ob}

In [46]:
class ARSLearner(object):
    """ 
    Object class implementing the ARS algorithm.

    The learner owns the shared noise table and a list of ``Worker``
    objects. ``train`` repeatedly asks every worker to evaluate random
    directions, keeps the best ``max_b`` of them and moves the weight
    matrix along their reward-weighted sum.
    """
    def __init__(self, env_name='HalfCheetah-v1',
                 policy_params=None,
                 l_rate=0.02,
                 num_worker=1,
                 delta_std=0.02, 
                 num_iter=1000,
                 step_size=0.01,
                 seed=123
                ):
        """
        Parameters
        ----------
        env_name : str
            Gym environment id given to every worker.
        policy_params : dict or None
            Forwarded to every worker (keys ``'ob_dim'``, ``'ac_dim'``).
        l_rate : float
            Learning rate used in the weight update of ``train``.
        num_worker : int
            Number of ``Worker`` objects to create.
        delta_std : float
            Perturbation scale forwarded to the workers.
        num_iter, step_size
            Stored on the instance only; ``train`` takes its own
            ``num_iter`` argument and uses ``l_rate`` as the step size.
        seed : int
            Base seed; worker ``i`` is seeded with ``seed + 7 * i``.
        """
        # create shared table for storing noise
        print("Creating deltas table.")
        noise = create_shared_noise()
        self.deltas = SharedNoiseTable(noise, seed=seed + 3)

        self.l_rate = l_rate
        self.delta_std = delta_std
        self.num_iter = num_iter
        self.step_size = step_size
        self.num_workers = num_worker
        # Every worker indexes the same noise array, so the offsets they
        # report can be looked up again through self.deltas.
        self.workers = [Worker(seed + 7 * i,
                               env_name=env_name,
                               policy_params=policy_params,
                               deltas=noise,
                               delta_std=delta_std
                               ) for i in range(num_worker)]

    def train(self, num_iter, max_b, mode, policy_params, num_rollouts=None):
        """
        Run ``num_iter`` ARS iterations and return the learned weights.

        Parameters
        ----------
        num_iter : int
            Number of update iterations.
        max_b : int
            Number of best directions kept per iteration.
        mode : int
            ``1``: plain linear policy; ``2``: observations are normalised
            with the per-dimension mean/variance of the observations
            collected in the previous iteration.
        policy_params : dict
            Must contain ``'ob_dim'`` and ``'ac_dim'``.
        num_rollouts : int or None
            Directions evaluated per worker per iteration; defaults to
            ``max_b``.

        Returns
        -------
        numpy.ndarray
            Weight matrix of shape ``(ac_dim, ob_dim)``, the layout
            ``Policy.get_action`` multiplies observations with.
        """
        ob_dim = policy_params['ob_dim']
        ac_dim = policy_params['ac_dim']
        weight = np.zeros([ac_dim, ob_dim])

        # Mode 1 ignores the statistics, but they are still passed along.
        mean, Sigma = None, None
        if mode == 2:
            mean = np.zeros(ob_dim)
            Sigma = np.identity(ob_dim)
        if num_rollouts is None:
            num_rollouts = max_b

        for t in range(num_iter):
            t1 = time.time()
            result_list = [worker.do_rollout(weight, num_rollouts, mode, mean, Sigma)
                           for worker in self.workers]
            t2 = time.time()
            print('total time of one step', t2 - t1)           
            print('iter ', t,' done')

            #Gather the result
            if mode == 2:
                ob_list = [ob
                           for result in result_list
                           for pair in result['rollout_ob']
                           for trajectory in pair
                           for ob in trajectory]
                if ob_list:
                    ob_array = np.asarray(ob_list, dtype=np.float64)
                    mean = ob_array.mean(axis=0)
                    variance = ob_array.var(axis=0)
                    # A constant dimension would make the normalisation
                    # divide by zero; leave such dimensions unscaled.
                    variance[variance < 1e-8] = 1.0
                    Sigma = np.diag(variance)

            reward_list = [(roll_r, delta_id)
                           for result in result_list
                           for delta_id, roll_r in zip(result['deltas_idx'],
                                                       result['rollout_rewards'])]
            # Best directions first: rank by the better of the two rollouts.
            reward_list = sorted(reward_list,
                                 key=lambda tup: max(tup[0][0], tup[0][1]),
                                 reverse=True)
            reward_list = reward_list[0:max_b]
            if not reward_list:
                continue

            top_rewards = np.array([roll_r for roll_r, _ in reward_list], dtype=np.float64)
            reward_std = np.std(top_rewards)
            if reward_std < 1e-8:
                # All kept rewards are equal, so every difference below is
                # zero; avoid dividing by zero.
                reward_std = 1.0

            #Update the weight
            cum_diff = np.zeros(weight.shape)
            for roll_r, delta_id in reward_list:
                delta = self.deltas.get(delta_id, weight.size).reshape(weight.shape)
                cum_diff += (roll_r[0] - roll_r[1]) * delta
            weight = weight + self.l_rate / (len(reward_list) * reward_std) * cum_diff

        # Keep the latest statistics so a mode-2 policy can be evaluated later.
        self.ob_mean = mean
        self.ob_Sigma = Sigma
        return weight

In [49]:
class Policy(object):
    """
    Linear policy: the action is a weight matrix applied to the
    (optionally normalised) observation.
    """
    def __init__(self, policy_params):
        """
        Parameters
        ----------
        policy_params : dict
            Must contain ``'ob_dim'`` and ``'ac_dim'``; the weight matrix
            is initialised to zeros with shape ``(ac_dim, ob_dim)``.
        """
        self.ob_dim = policy_params['ob_dim']
        self.ac_dim = policy_params['ac_dim']
        # Use the instance attribute, not a module-level `ob_dim`.
        self.weight = np.zeros([self.ac_dim, self.ob_dim])

    def update_weights(self, new_weight):
        """Replace the weight matrix with ``new_weight``."""
        self.weight = new_weight
        return

    def get_action(self, ob, mode, mean, Sigma):
        """
        Compute the action for observation ``ob``.

        Parameters
        ----------
        ob : numpy.ndarray
            Observation vector.
        mode : int
            ``1`` returns ``weight @ ob``. ``2`` returns
            ``weight @ diag(Sigma)^(-1/2) @ (ob - mean)``.
        mean : array-like or scalar
            Observation mean, used in mode 2 only.
        Sigma : numpy.ndarray
            Square matrix whose diagonal holds the per-dimension
            variances, used in mode 2 only.

        Raises
        ------
        ValueError
            If ``mode`` is neither 1 nor 2.
        """
        if mode == 1:
            return np.matmul(self.weight, ob)
        if mode == 2:
            variances = np.diag(np.asarray(Sigma, dtype=np.float64))
            # Only the diagonal matters, so scale element-wise instead of
            # building and inverting a full matrix. Dimensions with
            # (near-)zero variance get scale 0 rather than a singular inverse.
            usable = variances > 1e-12
            inv_std = np.zeros(variances.shape)
            inv_std[usable] = 1.0 / np.sqrt(variances[usable])
            return np.matmul(self.weight, inv_std * (ob - mean))
        raise ValueError("unknown policy mode: {!r} (expected 1 or 2)".format(mode))

In [54]:
# Test the algorithm: train a linear ARS policy on HalfCheetah.
env_name = 'HalfCheetah-v2'
env = gym.make(env_name)
# Observation size comes from the observation space, action size from
# the action space.
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
policy_params = {'ob_dim': ob_dim, 'ac_dim': ac_dim}

# Hyper-parameters. No trailing commas: `x = 1,` would create a tuple.
l_rate = 0.02
max_b = 4
num_worker = 1
delta_std = 0.02
num_iter = 1000
step_size = 0.01
seed = 123
mode = 1  # 1: plain linear policy, 2: normalised observations

# Keyword arguments keep each value bound to the intended parameter;
# max_b is an argument of train(), not of the constructor.
Learner = ARSLearner(env_name=env_name,
                     policy_params=policy_params,
                     l_rate=l_rate,
                     num_worker=num_worker,
                     delta_std=delta_std,
                     num_iter=num_iter,
                     step_size=step_size,
                     seed=seed,
                     )
weight = Learner.train(num_iter, max_b, mode, policy_params)
policy = Policy(policy_params)
policy.update_weights(weight)

  result = entry_point.load(False)


DependencyNotInstalled: No module named 'mujoco_py'. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)