In [1]:
import sys
import itertools
import collections
import matplotlib
import numpy as np
import tensorflow as tf
import gym
import roboschool

import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler


%matplotlib inline
matplotlib.style.use('ggplot')

  from ._conv import register_converters as _register_converters


In [2]:
env = gym.envs.make("RoboschoolWalker2d-v1")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Environment '<class 'roboschool.gym_mujoco_walkers.RoboschoolWalker2d'>' has deprecated methods '_step' and '_reset' rather than 'step' and 'reset'. Compatibility code invoked. Set _gym_disable_underscore_compat = True to disable this behavior.[0m


In [3]:
# settings

state_size=env.observation_space.shape[0]
action_size=env.action_space.shape[0]
gamma=.95
LEARNING_RATE=0.01
discount_factor=1.0

In [4]:
# Brain

class PolicyApproximator:
    
    def __init__(self):
        
        # Tensorflow settings

        tf.reset_default_graph()
        
        # Building the Network
        
        self.states=tf.placeholder(name='states',shape=[None,state_size],dtype=tf.float32)
        self.actions=tf.placeholder(name='actions',shape=[None,action_size],dtype=tf.float32)
        self.target=tf.placeholder(name='target',dtype=tf.float32)

                
        self.layer_mu1=tf.layers.dense(activation=None,units=10,inputs=self.states)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.layer_mu2=tf.layers.dense(activation=None,units=action_size,inputs=self.layer_mu1)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
       
        self.layer_sigma1=tf.layers.dense(activation=None,units=10,inputs=self.states)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.layer_sigma2=tf.layers.dense(activation=None,units=action_size,inputs=self.layer_sigma1)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
    
        self.layer_sigma2 = tf.nn.softplus(self.layer_sigma2) + 1e-5
        
        self.normal_dist = tf.distributions.Normal(self.layer_mu2, self.layer_sigma2)
        self.actions_ = self.normal_dist.sample(1)
        self.actions_ = tf.clip_by_value(self.actions_, env.action_space.low[0], env.action_space.high[0])
        self.actions_=tf.squeeze(input=self.actions_)
        
        

        self.neg_log = -tf.reduce_sum(self.normal_dist.log_prob(self.actions_)) 
        self.loss=tf.reduce_mean(self.neg_log* self.target)
        # Add cross entropy cost to encourage exploration
        self.loss -= 1e-1 * self.normal_dist.entropy()
            
        
        self.optimizer=tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.loss)
        
        
        self.init=tf.global_variables_initializer()
        
        self.session=tf.Session()
        self.session.run(self.init)
    
    def predict(self,state):
        
        state=np.reshape(state,newshape=(1,state_size))
        return self.session.run(self.actions_,feed_dict={self.states:state})
        
    
    def train(self,states,target,actions):

        states=np.reshape(states,newshape=(1,state_size))
        actions=np.reshape(actions,newshape=(1,action_size))
  
        _,loss=self.session.run([self.optimizer,self.loss],
                               feed_dict={self.states:states,self.actions:actions,self.target:target})
            
        return loss
    
    def close(self):
        self.session.close()

In [5]:
class ValueApproximator():
    """
    Value Function approximator. 
    """
    
    def __init__(self, learning_rate=0.1, scope="value_approximator"):
        with tf.variable_scope(scope):
            self.states = tf.placeholder(tf.float32, [None,state_size], "states")
            self.targets = tf.placeholder(dtype=tf.float32, name="targets")

            # This is just table lookup estimator
            self.layer1=tf.layers.dense(activation=None,units=20,inputs=self.states)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.output_layer=tf.layers.dense(activation=None,units=1,inputs=self.layer1)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
       

            self.loss = tf.squared_difference(self.output_layer, self.targets)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.loss) 
            
            
                    
            self.init=tf.global_variables_initializer()
        
            self.session=tf.Session()
            self.session.run(self.init)
    
    def predict(self, states):
        states=np.reshape(states,newshape=(1,state_size))

        return self.session.run(self.output_layer, { self.states: states })

    def train(self, states, targets):
        states=np.reshape(states,newshape=(1,state_size))

        _, loss = self.session.run([self.optimizer, self.loss], feed_dict={ self.states: states, self.targets: targets })
        return loss

In [6]:
policy_approximator=PolicyApproximator()
value_approximator=ValueApproximator()

In [8]:

G=[]
mean_G_all=[]

Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    
for ep in range(100000):
    episode_rewards=[]
    episode=[]
    
    state=env.reset()
    while True:
        env.render()
        action=policy_approximator.predict(state)
        print(action)
        
        next_state,reward,done,info=env.step(action)
        
        
        # Keep track of the transition
        episode.append(Transition(state=state, action=action, 
                                  reward=reward, next_state=next_state, done=done))
            
        
        episode_rewards.append(reward)
        
        # Calculate TD Target
        value_next = value_approximator.predict(next_state)
        td_target = reward + discount_factor * value_next
        td_error = td_target - value_approximator.predict(state)
            
        # Update the value estimator
        value_approximator.train(state, td_target)
            
        # Update the policy estimator
        # using the td error as our advantage estimate
        policy_approximator.train(state, td_error, action)
            

        state=next_state    
        
        if done:
            # Calculate sum of the rewards
            episode_rewards_sum = sum(episode_rewards)     
            G.append(episode_rewards_sum)
            total_G = np.sum(G)
            maximumReturn = np.amax(G)
            
            if ep%100==0:
                
                print("==========================================")
                print("Episode: ", ep)
                print("Rewards: ", episode_rewards_sum)
                print("Max reward so far: ", maximumReturn)
                # Mean reward
                mean_G = np.divide(total_G, ep+1)
                mean_G_all.append(mean_G)
                print("Mean Reward", mean_G)
    
            break
            
        

[-0.23197001  0.7233214  -0.02409253  0.27918836  0.5972785   0.7331096 ]
[-1.          1.         -0.2511039   0.20429811 -0.8338735   0.20161802]
[-1.          1.         -0.5492358   1.         -0.08398584  0.7313343 ]
[-1.          1.         -0.6472949   1.          0.1924558   0.48427066]
[-1.          1.         -0.59018356  1.          0.9068792   0.8672558 ]
[-1.          1.         -0.82968503  1.          0.24480972  0.8608326 ]
[-1.          1.         -0.88749903  1.          1.          0.7327295 ]
[-1.          1.         -0.9104085   1.          0.37135065  0.34960306]
[-1.          1.         -1.          1.          1.          0.44863215]
[-1.          1.         -0.96446335  1.          0.27451745  0.62236273]
[-1.          1.         -0.6211193   1.          0.6598036   0.38793385]
[-1.          1.         -0.5925622   1.          0.74090135  0.18876508]
[-1.          1.         -0.5817112   1.         -0.6079784   0.04844797]
[-1.          1.         -0.48771334  

AssertionError: 