In [2]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pdb
import sys
from tensorflow.keras import layers
from collections import deque
import random

In [4]:
# Environment under study. The MountainCar variant is kept for quick swapping:
#env = gym.make('MountainCar-v0')
env = gym.make('CartPole-v0')
# Length of one observation vector; passed to RLAgent as state_size.
observation_size = env.observation_space.shape[0]
# Number of discrete actions; passed to RLAgent as action_size.
action_size = env.action_space.n
# NOTE(review): the description below appears to be for MountainCar-v0 (the
# commented-out env above), not for the CartPole-v0 env actually created here:
# observation = Box(2)  with 0: position [-1.2,0.6] 1: velocity [-0.07, 0.07]

In [7]:
from tensorflow.keras.optimizers import Adam
class RLAgent():
    """Epsilon-greedy Q-learning agent backed by a small Keras MLP.

    The network maps a state row vector to one Q-value per action. Transitions
    are stored in a replay buffer (`memorize`) and replayed by `learn`.

    Parameters
    ----------
    learning_rate : float
        Accepted for interface compatibility. NOTE(review): it is NOT forwarded
        to the optimizer -- `Adam()` is built with its default settings. Wire it
        in deliberately (and revisit the value callers pass) if that is wanted.
    state_size : int
        Number of features in one state vector (network input width).
    action_size : int
        Number of discrete actions (network output width).
    hidden_size : int
        Units in each of the two hidden tanh layers.
    epsilon : float
        Initial probability of choosing a random action in `action`.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    epsilon_min : float
        Epsilon is only decayed while it is strictly above this value.
    epsilon_decay : float
        Factor epsilon is multiplied by after each effective `learn` call.
    batch_size : int
        Default number of transitions sampled by `learn`.
    decay : float
        Accepted for interface compatibility; currently unused.
    """

    def __init__(self,learning_rate,state_size,
                 action_size,hidden_size,epsilon = 0.5,
                 gamma = 0.7,epsilon_min = 0.01,
                 epsilon_decay = 0.99,batch_size = 32,
                 decay = 0.01):
        #RL parameters
        self.memory = deque(maxlen = 100000)
        self.state_size = state_size
        self.gamma = gamma
        self.action_size = action_size
        self.batch_size = batch_size
        #epsilon learning param
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        # Q-network: state -> tanh -> tanh -> linear Q-value per action.
        self.model = tf.keras.models.Sequential()
        self.model.add(layers.Dense(hidden_size, input_dim=state_size, activation = 'tanh'))
        self.model.add(layers.Dense(hidden_size, activation = 'tanh'))
        self.model.add(layers.Dense(action_size,activation = 'linear'))
        # learning_rate / decay are intentionally not applied here (see class docstring).
        self.model.compile(loss = 'mse',
                           optimizer = Adam()) #potential improvement in AdamOpt

    def action(self,state):
        """Pick an action for `state` using the epsilon-greedy rule.

        `state` is passed straight to `model.predict`; callers in this notebook
        supply a single-row batch (`s.reshape(1, -1)`).

        Returns a random action index with probability `epsilon`, otherwise the
        index of the largest predicted Q-value.
        """
        if np.random.random() <= self.epsilon:
            return np.random.randint(self.action_size)

        return np.argmax(self.model.predict(state))

    def memorize(self, state, action, reward,next_state,done):
        """Append one (state, action, reward, next_state, done) transition to the buffer."""
        self.memory.append((state,action,reward,next_state,done))

    def forget(self):
        """Drop every stored transition."""
        self.memory.clear()

    def learn(self, batch_size = None):
        """Fit the Q-network on a random sample of stored transitions.

        Parameters
        ----------
        batch_size : int or None
            Number of transitions to sample (capped at the buffer length).
            Falls back to `self.batch_size` when None.

        For each sampled transition the target for the taken action is
        `reward` if the transition was terminal, otherwise
        `reward + gamma * max_a Q(next_state, a)`; the other actions keep the
        network's current prediction. Epsilon is decayed once per call.

        Does nothing (no fit, no epsilon decay) when there is nothing to sample,
        instead of handing an empty batch to `model.fit`.
        """
        if batch_size is None:
            batch_size = self.batch_size
        sample_size = min(len(self.memory), batch_size)
        if sample_size <= 0:
            return

        batch = random.sample(self.memory, sample_size)
        # Stored states are single-row batches; take row 0 of each and stack
        # them so the network is queried twice per update rather than twice
        # per transition.
        states = np.array([transition[0][0] for transition in batch])
        next_states = np.array([transition[3][0] for transition in batch])
        actions = np.array([transition[1] for transition in batch])
        rewards = np.array([transition[2] for transition in batch], dtype=float)
        terminal = np.array([transition[4] for transition in batch], dtype=bool)

        targets = self.model.predict(states)
        best_next = np.max(self.model.predict(next_states), axis=1)
        # Only the Q-value of the action actually taken is overwritten.
        targets[np.arange(sample_size), actions] = np.where(
            terminal, rewards, rewards + self.gamma * best_next)

        self.model.fit(states, targets, batch_size=sample_size, verbose = 0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        
# Discount factor read by discount_rewards at call time. This is a separate
# value from the gamma each RLAgent instance stores for itself.
gamma = 0.99
def discount_rewards(r):
    """Compute the discounted cumulative reward for every time step.

    Parameters
    ----------
    r : array-like
        1D sequence of per-step rewards. Lists and integer/boolean arrays are
        accepted and promoted to float so discounted values are not truncated;
        floating-point arrays keep their dtype.

    Returns
    -------
    numpy.ndarray
        Array of the same length where entry t equals
        r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...
        using the module-level `gamma`.
    """
    rewards = np.asarray(r)
    if not np.issubdtype(rewards.dtype, np.floating):
        # zeros_like would otherwise inherit an integer dtype and silently
        # round every discounted value down.
        rewards = rewards.astype(float)
    discounted_r = np.zeros_like(rewards)
    running_add = 0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r


In [10]:
# TF1-style reset so that re-running this cell builds the model in a fresh graph.
tf.reset_default_graph()

# NOTE(review): the first argument (1e-5) is RLAgent's learning_rate, which the
# class does not pass on to its optimizer.
Agent = RLAgent(1e-5,observation_size,action_size,24)
init = tf.global_variables_initializer()

total_episodes = 1500
# Upper bound on steps per episode. The original note says the environment's
# own limit is 200 (written for Mountain Car) -- TODO confirm for CartPole-v0.
max_ep = 201

# NOTE(review): Keras may manage its own session for the model; confirm whether
# this explicit session/initializer is still required.
sess = tf.Session()
sess.run(init)
total_reward = []   # undiscounted return of every finished episode
total_length = []   # not filled in this cell; name kept for other cells
done_once = False   # not used in this cell; name kept for other cells
for i in range(total_episodes):
    s = env.reset()
    running_reward = 0
    d = False
    render_episode = i % 100 == 0  # show every 100th episode
    for j in range(max_ep):
        #Note that the agent assumes batch input of state, so state have to be np array[ [state_values]]
        a = Agent.action(s.reshape(1,-1))
        s1,r,d,_ = env.step(a)  #new state, reward, done
        Agent.memorize(s.reshape(1,-1),a,r,s1.reshape(1,-1),d)
        running_reward += r
        if render_episode:
            env.render()
        s = s1
        if d:
            # Train on every transition of the episode. The step index j is one
            # less than the number of stored transitions, so learn(j) skipped a
            # sample and asked for an empty batch when j == 0.
            Agent.learn(len(Agent.memory))
            break
    total_reward.append(running_reward)
    # The buffer only ever holds the current episode.
    Agent.forget()
    if i % 100 == 0:
        print(str(np.mean(total_reward[-100:])) + ' ' + str(i//100) + '/'+str(total_episodes//100) )

env.close()

46.0
41.45
105.68
191.0
192.34
181.86
187.63
188.5
179.98
191.57
180.43
179.74
176.15
185.52
190.77


In [16]:
#final run of learned agent
# Act greedily by querying the Q-network directly: Agent.action would still
# pick a random move with probability Agent.epsilon, which distorts the score
# of the learned policy.
s = env.reset()
total_reward = 0
d = False
while not d:
    q_values = Agent.model.predict(s.reshape(1,-1))[0]
    a = int(np.argmax(q_values))
    s, r, d, _ = env.step(a)
    total_reward += r
    env.render()
print("total reward:" + str(total_reward))
env.close()

total reward:200.0


In [None]:

# Release the tf.Session created in the training cell.
sess.close()