In [1]:
import gym
import itertools
import matplotlib
import numpy as np
import time
import os

In [2]:
# Normalizing the states

class Normalizer():
    """Running per-dimension mean/variance tracker used to standardize states.

    `observe` folds observations in one at a time (Welford's online update);
    `normalize` centers and scales an input with the statistics gathered so far.
    """

    # Lower bound applied to the variance so that scaling never divides by a
    # zero (or near-zero) standard deviation.
    VAR_FLOOR = 1e-2

    def __init__(self, nb_inputs):
        """Create zeroed statistics for observations with `nb_inputs` dimensions."""
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.var = np.zeros(nb_inputs)

    def observe(self, x):
        """Update the running count, mean and variance with one observation `x`."""
        self.n += 1.
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        # Sum of squared deviations, built from the mean before and after the update.
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min = self.VAR_FLOOR)

    def normalize(self, inputs):
        """Return `(inputs - mean) / std` using the running statistics.

        The variance floor is applied here as well: `self.var` is initialised
        to zeros, so without it a call made before any `observe` would divide
        by zero. Once `observe` has run, `self.var` is already floored and the
        result is unchanged.
        """
        obs_std = np.sqrt(np.maximum(self.var, self.VAR_FLOOR))
        return (inputs - self.mean) / obs_std

In [3]:
class Worker(object):
    """
    Rollout generator that owns a single gym environment.

    Attributes:
        env: environment created with `gym.make(env_name)`.
        delta_std: scale applied to a perturbation direction when the policy
            is evaluated during exploration.
    """

    # Hard cap on the number of steps taken in a single episode.
    MAX_EPISODE_STEPS = 1000

    def __init__(self,
                 env_name='Swimmer-v2',
                 policy_params=None,
                 delta_std=0.02,
                 ):
        """Create the worker's environment.

        Args:
            env_name: gym environment id passed to `gym.make`.
            policy_params: accepted for interface compatibility; not used here.
            delta_std: perturbation scale forwarded to `policy.evaluate`.
        """
        # initialize OpenAI environment for each worker
        self.env = gym.make(env_name)
        self.delta_std = delta_std

    def explore(self, env, normalizer, policy, direction = None, delta = None):
        """Run one episode and return the sum of its rewards.

        Args:
            env: environment to step; `None` means the worker's own `self.env`.
            normalizer: object with `observe(state)` and `normalize(state)`.
                It is updated with every state visited.
            policy: object with `evaluate(state, delta, delta_std, direction)`.
            direction: `None`, "positive" or "negative"; forwarded to the policy.
            delta: perturbation forwarded to the policy.

        Returns:
            Sum of the rewards collected until the environment reports `done`
            or `MAX_EPISODE_STEPS` steps have been taken.
        """
        if env is None:
            env = self.env
        state = env.reset()
        done = False
        num_plays = 0
        sum_rewards = 0
        while not done and num_plays < self.MAX_EPISODE_STEPS:
            normalizer.observe(state)
            state = normalizer.normalize(state)
            action = policy.evaluate(state, delta, self.delta_std, direction)
            state, reward, done, _ = env.step(action)
            sum_rewards += reward
            num_plays += 1
        return sum_rewards

    def do_rollout(self, num_rollouts, normalizer, policy):
        """Sample `num_rollouts` perturbations and evaluate each in both directions.

        Rollouts run in the worker's own environment (`self.env`), so the
        method does not depend on any module-level `env` variable.

        Returns:
            dict with keys 'deltas', 'positive_rewards' and 'negative_rewards';
            the two reward lists are index-aligned with 'deltas'.
        """
        # Initializing the perturbations deltas
        deltas = policy.sample_deltas(num_rollouts)

        # All positive-direction episodes run first, then all negative ones;
        # the order matters because every episode updates `normalizer`.
        positive_rewards = [
            self.explore(self.env, normalizer, policy, direction = "positive", delta = deltas[k])
            for k in range(num_rollouts)
        ]
        negative_rewards = [
            self.explore(self.env, normalizer, policy, direction = "negative", delta = deltas[k])
            for k in range(num_rollouts)
        ]

        return {'deltas': deltas, 'positive_rewards': positive_rewards, 'negative_rewards': negative_rewards}

In [7]:
class ARSLearner(object):
    """
    Object class implementing the ARS algorithm.

    Holds the policy, the state normalizer and a worker that owns the
    environment. Both training and evaluation run in the worker's environment
    (`self.worker.env`).
    """
    def __init__(self,
                 env_name='Swimmer-v2',
                 policy_params=None,
                 learning_rate=0.02,
                 delta_std=0.02, 
                 num_iter=1000,
                ):
        """Build the policy, normalizer and worker.

        Args:
            env_name: gym environment id forwarded to the worker.
            policy_params: mapping with keys 'ob_dim' and 'ac_dim'. Required
                in practice even though the default is None, because it is
                indexed immediately.
            learning_rate: step size used by `policy.update`.
            delta_std: perturbation scale forwarded to the worker.
            num_iter: number of training iterations run by `train`.
        """
        self.policy = Policy(policy_params['ob_dim'], policy_params['ac_dim'])
        self.learning_rate = learning_rate
        self.num_iter = num_iter
        self.normalizer = Normalizer(policy_params['ob_dim'])          
        self.worker = Worker(env_name=env_name,
                             policy_params=policy_params,
                             delta_std=delta_std)
        
    def train(self,num_rollouts,max_b):
        """Run `self.num_iter` ARS iterations.

        Each iteration collects `num_rollouts` positive/negative rollout
        pairs, keeps the `max_b` directions with the highest
        max(positive, negative) reward, updates the policy, and prints the
        reward of one unperturbed episode together with the elapsed time.
        """
        for t in range(self.num_iter):
            t1 = time.time() 
            result_dict = self.worker.do_rollout(num_rollouts, self.normalizer, self.policy) 

            # Gather the result
            deltas = result_dict['deltas']
            positive_rewards = result_dict['positive_rewards']
            negative_rewards = result_dict['negative_rewards']

            # Standard deviation over every collected reward, used to scale the step
            all_rewards = np.array(positive_rewards + negative_rewards)
            sigma_r = all_rewards.std()

            # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
            scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
            order = sorted(scores.keys(), key = lambda x:scores[x], reverse = True)[:max_b]
            rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

            # Updating our policy
            self.policy.update(rollouts, sigma_r, self.learning_rate, max_b)

            # Printing the final reward of the policy after the update.
            # The worker's own environment is used, not a module-level `env`.
            reward_evaluation = self.worker.explore(self.worker.env, self.normalizer, self.policy)
            t2 = time.time() 
            print('total time of one step:', t2 - t1,', reward=' ,reward_evaluation,', iter ', t,' done')      
    
    def evaluate(self,test_num,isRender):
        """Run `test_num` unperturbed episodes and print their returns.

        Args:
            test_num: number of episodes to run.
            isRender: when true, `env.render()` is called after every step.

        The normalizer is applied but not updated. Prints the policy weights,
        progress every 100 steps, the per-episode returns and their mean and
        standard deviation.
        """
        env = self.worker.env
        # Same per-episode step cap as the one used while training.
        max_steps = 1000
        returns = []
        print(self.policy.theta)
        for i in range(test_num):
            print('iter', i)
            state = env.reset()
            done = False
            num_plays = 0
            sum_rewards = 0
            while not done and num_plays < max_steps:
                state = self.normalizer.normalize(state)
                # With direction=None the policy is evaluated without perturbation.
                action = self.policy.evaluate(state, None, self.worker.delta_std, None)
                state, reward, done, _ = env.step(action)
                sum_rewards += reward
                num_plays += 1
                if isRender:
                    env.render()   
                if num_plays % 100 == 0: print("%i/%i"%(num_plays, max_steps))

            returns.append(sum_rewards)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))


In [8]:

# Building the AI

class Policy():
    """Linear policy: the action is `theta.dot(state)`.

    `theta` has shape (output_size, input_size) and starts at zero.
    """

    def __init__(self, input_size, output_size):
        """Create a zero weight matrix of shape (output_size, input_size)."""
        self.theta = np.zeros((output_size, input_size))

    def evaluate(self, input, delta = None, delta_std =  0.02, direction = None):
        """Return the action for `input`, optionally with perturbed weights.

        Args:
            input: state vector.
            delta: perturbation with the same shape as `theta`; only read when
                `direction` is not None.
            delta_std: scale applied to `delta`.
            direction: None uses `theta` as is, "positive" uses
                `theta + delta_std*delta`, any other value uses
                `theta - delta_std*delta`.
        """
        if direction is None:
            return self.theta.dot(input)
        elif direction == "positive":
            return (self.theta + delta_std*delta).dot(input)
        else:
            return (self.theta - delta_std*delta).dot(input)

    def sample_deltas(self,num_rollouts):
        """Return `num_rollouts` standard-normal perturbations shaped like `theta`."""
        return [np.random.randn(*self.theta.shape) for _ in range(num_rollouts)]

    def update(self, rollouts, sigma_r,learning_rate,max_b):
        """Move `theta` along the reward-weighted sum of perturbation directions.

        Args:
            rollouts: iterable of (r_pos, r_neg, delta) tuples.
            sigma_r: reward standard deviation used to scale the step.
            learning_rate: step size.
            max_b: number of directions the step is averaged over.

        When `max_b * sigma_r` is zero or not finite (for example when every
        reward is identical, or no direction was selected) the update is
        skipped. Dividing by it would otherwise turn `theta` into NaN/inf or
        raise ZeroDivisionError.
        """
        denominator = max_b * sigma_r
        if denominator == 0 or not np.isfinite(denominator):
            return
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, d in rollouts:
            step += (r_pos - r_neg) * d
        self.theta += learning_rate / denominator * step

# Note: exploring the policy in one specific direction over one episode is implemented in `Worker.explore`.

In [9]:
# Train ARS on HalfCheetah.
# The names below stay module-level globals because other cells read them.
env_name = 'HalfCheetah-v2'
env = gym.make(env_name)

# Observation/action sizes come from the environment's spaces.
ob_dim, ac_dim = env.observation_space.shape[0], env.action_space.shape[0]
policy_params = dict(ob_dim=ob_dim, ac_dim=ac_dim)

# Hyper-parameters
learning_rate, delta_std, num_iter = 0.02, 0.03, 1000
num_rollouts = max_b = 8

Learner = ARSLearner(
    env_name=env_name,
    policy_params=policy_params,
    learning_rate=learning_rate,
    delta_std=delta_std,
    num_iter=num_iter,
)

Learner.train(num_rollouts=num_rollouts, max_b=max_b)

total time of one step: 3.100667953491211 , reward= -0.09666300303047773 , iter  0  done
total time of one step: 4.585129022598267 , reward= -0.9899459044483576 , iter  1  done
total time of one step: 3.810541868209839 , reward= 0.6797305277106737 , iter  2  done
total time of one step: 3.845560073852539 , reward= 1.2707291036271393 , iter  3  done
total time of one step: 3.269714832305908 , reward= 1.5116240143419049 , iter  4  done
total time of one step: 2.9285519123077393 , reward= 2.928415072028285 , iter  5  done
total time of one step: 2.846930980682373 , reward= -0.10433289236716452 , iter  6  done
total time of one step: 3.0134689807891846 , reward= 0.23326727681880127 , iter  7  done
total time of one step: 2.994875907897949 , reward= 373.4531294825746 , iter  8  done
total time of one step: 2.650787115097046 , reward= 173.3880407571362 , iter  9  done
total time of one step: 2.489946126937866 , reward= 14.613119678304253 , iter  10  done
total time of one step: 2.48705792427

KeyboardInterrupt: 

In [1]:
# Run 10 evaluation episodes of the trained policy without rendering.
# NOTE: `Learner` only exists after the training cell has been executed in the
# same kernel session; the NameError recorded below comes from running this
# cell without it.
Learner.evaluate(test_num=10,isRender=False)

NameError: name 'Learner' is not defined