In [1]:
import sys
import itertools
import collections
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import gym
import roboschool

%matplotlib inline
matplotlib.style.use('ggplot')

  from ._conv import register_converters as _register_converters


In [2]:
env = gym.envs.make("RoboschoolWalker2d-v1") #RoboschoolWalker2d-v1

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Environment '<class 'roboschool.gym_mujoco_walkers.RoboschoolWalker2d'>' has deprecated methods '_step' and '_reset' rather than 'step' and 'reset'. Compatibility code invoked. Set _gym_disable_underscore_compat = True to disable this behavior.[0m


In [3]:
# settings
state_size=env.observation_space.shape[0]
action_size=env.action_space.shape[0]
gamma=.95
LEARNING_RATE=0.01
LEARNING_RATE2=0.01
discount_factor=0.95

In [4]:
# Brain

class PolicyApproximator:
    
    def __init__(self):
        
        # Tensorflow settings

        tf.reset_default_graph()
        
        # Building the Network
        
        self.states=tf.placeholder(name='states',shape=[None,state_size],dtype=tf.float32)
        self.target=tf.placeholder(name='target',dtype=tf.float32)

        self.layer_mu0=tf.layers.dense(activation=None,units=30,inputs=self.states)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.layer_mu1=tf.layers.dense(activation=None,units=20,inputs=self.layer_mu0)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.layer_mu2=tf.layers.dense(activation=None,units=action_size,inputs=self.layer_mu1)#,kernel_initializer=tf.contrib.layers.xavier_initializer())

        self.layer_sigma0=tf.layers.dense(activation=None,units=30,inputs=self.states)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.layer_sigma1=tf.layers.dense(activation=None,units=20,inputs=self.layer_sigma0)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.layer_sigma2=tf.layers.dense(activation=None,units=action_size,
                                          inputs=self.layer_sigma1)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
    
        self.layer_sigma2 = tf.nn.softplus(self.layer_sigma2) + 1e-5
        
        self.normal_dist = tf.distributions.Normal(self.layer_mu2, self.layer_sigma2)
        self.actions_ = self.normal_dist.sample(1)
        self.actions_ = tf.clip_by_value(self.actions_, env.action_space.low[0], env.action_space.high[0])
        self.actions_=tf.squeeze(input=self.actions_)
        
        

        self.loss=tf.reduce_mean(tf.reduce_sum(-self.normal_dist.log_prob(self.actions_)* self.target,axis=1))
        # Add cross entropy cost to encourage exploration
        self.loss -= 1e-1 * self.normal_dist.entropy()
            
        
        self.optimizer=tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.loss)
        
        
        self.init=tf.global_variables_initializer()
        
        self.session=tf.Session()
        self.session.run(self.init)
    
    def predict(self,states):
        
        states=np.reshape(states,newshape=(1,state_size))
        return self.session.run(self.actions_,feed_dict={self.states:states})
        
    
    def train(self,states,targets,actions):

        states=np.reshape(states,newshape=(1,state_size))
        actions=np.reshape(actions,newshape=(1,action_size))
  
        _,loss=self.session.run([self.optimizer,self.loss],
                               feed_dict={self.states:states,self.actions_:actions,self.target:targets})
            
        return loss
    
    def close(self):
        self.session.close()

In [5]:
class ValueApproximator():
    """
    Value Function approximator. 
    """
    
    def __init__(self, learning_rate=0.1, scope="value_approximator"):
        with tf.variable_scope(scope):
            self.states = tf.placeholder(tf.float32, [None,state_size], "states")
            self.targets = tf.placeholder(dtype=tf.float32, name="targets")

            # This is just table lookup estimator
            self.layer1=tf.layers.dense(activation=None,units=10,inputs=self.states)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.layer2=tf.layers.dense(activation=None,units=10,inputs=self.layer1)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.layer3=tf.layers.dense(activation=None,units=5,inputs=self.layer2)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.layer4=tf.layers.dense(activation=None,units=3,inputs=self.layer3)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            self.output_layer=tf.layers.dense(activation=None,units=1,inputs=self.layer4)#,kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.outpput_layer=tf.squeeze(self.output_layer)

            self.loss = tf.squared_difference(self.output_layer, self.targets)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE2).minimize(self.loss) 
            
            
                    
            self.init=tf.global_variables_initializer()
        
            self.session=tf.Session()
            self.session.run(self.init)
    
    def predict(self, states):
        states=np.reshape(states,newshape=(1,state_size))

        return self.session.run(self.output_layer, { self.states: states })

    def train(self, states, targets):
        states=np.reshape(states,newshape=(1,state_size))

        _, loss = self.session.run([self.optimizer, self.loss], feed_dict={ self.states: states, self.targets: targets })
        return loss

In [6]:
policy_approximator=PolicyApproximator()
value_approximator=ValueApproximator()

In [7]:

G=[]
mean_G_all=[]
number_of_episodes=1000
Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    
for ep in range(number_of_episodes):
    episode_rewards=[]
    episode=[]
    
    state=env.reset()
    while True:
        env.render()
        action=policy_approximator.predict(state)
       # print(action)
        next_state,reward,done,info=env.step(action)
        print (reward)
        
        # Keep track of the transition
        episode.append(Transition(state=state, action=action, 
                                  reward=reward, next_state=next_state, done=done))
            
        
        episode_rewards.append(reward)

        state=next_state    
        
        if done:
            # Calculate sum of the rewards
            episode_rewards_sum = sum(episode_rewards)     
            G.append(episode_rewards_sum)
            total_G = np.sum(G)
            maximumReturn = np.amax(G)
            
            if ep%100==0:
                
                print("==========================================")
                print("Episode: ", ep)
                print("Rewards: ", episode_rewards_sum)
                print("Max reward so far: ", maximumReturn)
                # Mean reward
                mean_G = np.divide(total_G, ep+1)
                mean_G_all.append(mean_G)
                print("Mean Reward", mean_G)
    
            break
            
            
    # Go through the episode and make policy updates
    for t, transition in enumerate(episode):
        # The return after this timestep
        total_return = sum(discount_factor**i * t.reward for i, t in enumerate(episode[t:]))
        # Calculate baseline/advantage
        baseline_value = value_approximator.predict(transition.state)            
        advantage = total_return - baseline_value
        # Update our value estimator
        value_approximator.train(transition.state, total_return)
        # Update our policy estimator
        policy_approximator.train(transition.state, advantage, transition.action)         
            
        

0.19579009397566538
0.5697747704587526
0.6708250172014231
1.2213017297865008
1.789248503363342
1.1227394647765323
0.5456792482029413
0.8168414609099273
1.3111790286915492
1.9840614383516368
-0.5724854751402746
Episode:  0
Rewards:  9.654955280577996
Max reward so far:  9.654955280577996
Mean Reward 9.654955280577996
0.5813449853303609
1.5395647895580624
2.4959128877686454
3.127055888414907
1.081868746454711
0.7640512570753344
1.472912759789324
2.631607375678141
2.776747016375884
0.11794682681793328
0.5217311218584655
1.949780413490953
1.9476977310085206
2.319511760426394
2.111231583297194
-0.13590055102540638
0.7558175072394078
1.73922238550731
1.5605457145400579
0.9126753774093231
1.4557674299387147
1.3280305730033435
1.8267595057637664
0.9747702479638976
1.8451959849626294
-0.1550163237407105
0.6435031656044884
1.9990116997010774
2.0290857786298147
2.1727690514569984
1.609608106709493
0.9061366542257019
0.3408025163560524
0.3998652473397669
1.5346157709456747
1.692843637164333
1.7074

AssertionError: 

In [None]:
plt.plot(G)
plt.ylabel('Returns')
plt.xlabel('Number of episodes')

In [None]:
plt.plot(mean_G_all)
plt.ylabel('Average of Returns ')
plt.xlabel('Number of episodes')