## Agent Design and Neural Model

In [1]:
import gym

In [2]:
env = gym.make('CartPole-v0')

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

Using TensorFlow backend.


In [4]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = 2000)
        self.gamma = 0.95 #Discount Factor
        #Exploration vs Exploitation Tradeoff
        self.epsilon = 1.0 #100% Random Exploration
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        #Parameter for neural networks
        self.learning_rate = 0.01
        self.model = self._create_model()
        
    def _create_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim = self.state_size, activation = 'relu'))
        model.add(Dense(24, activation = 'relu'))
        model.add(Dense(self.action_size, activation = 'linear'))
        model.compile(loss = 'mse', optimizer = Adam(lr = 0.001))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        #Remember past experience
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state):
        #Sampling according to the Epsilon Greedy method
        if np.random.rand()<=self.epsilon:
            #Take a random action
            return random.randrange(self.action_size)
        #Ask neural network to give me the most suitable action
        return np.argmax(model.predict(state)[0])
    
    def train(self, batch_size=32):
        #Training using a 'Replay Buffer'
        minibatch = random.sample(self.memory, batch_size)
        for experience in minibatch:
            state,action,reward,next_state,done = experience
        # X,Y : state, expected reward
        if not done:
            #if game is not yet over, then we use bellman equation to approximate the target value of reward
            target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
        else:
            target = reward
            
        target_f = self.model.predict(state)
        target_f[0][action] = target
        
        # X = state, Y = target_f
        self.model.fit(state, target_f, epochs=1, verbose=0)
        
        # as you're getting more experience, do not trust on randomness
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    def load(self, name):
        self.model.load_weights(name)
    def save(self, name):
        self.model.save_weights(name)

In [5]:
model = Sequential()
model.add(Dense(24, input_dim = 4, activation = 'relu'))
model.add(Dense(24, activation = 'relu'))
model.add(Dense(2, activation = 'linear'))
model.compile(loss = 'mse', optimizer = Adam(lr = 0.001))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [6]:
x = np.random.rand(1, 4) # 1 is the batch size here
model.predict(x)

array([[ 0.00202405, -0.04044954]], dtype=float32)

## Training the DQN Agent (Deep Q-Learner)

In [7]:
n_episodes = 1000
output_dir = "cartpole_model/"

In [8]:
agent = Agent(state_size=4, action_size=2)
done = False
state_size = 4
action_size = 2

In [None]:
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size]) # 1 is the batch size here
    batch_size = 32
    
    for time in range(500):
        env.render()
        action = agent.act(state) # action is 0 or 1
        next_state, reward, done, other_info = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done) #Experience for the agent
        
        if done:
            print("Game Episode :{}/{} High Score :{} Exploration rate :{:.2}".format(e, 500, time, agent.epsilon))
            break
            
    if len(agent.memory) > batch_size:
        agent.train(batch_size)
    if e%50==0:
        agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
        
print("Deep Q-Learner Model Trainder") 
#Although this is a shallow NN, 
#you can train a CNN or a more dense NN 
#depending upon the complexity of the game for which you're taining the network
env.close()

Game Episode :0/500 High Score :13 Exploration rate :1.0
Game Episode :1/500 High Score :39 Exploration rate :1.0
Game Episode :2/500 High Score :21 Exploration rate :0.99
Game Episode :3/500 High Score :18 Exploration rate :0.99
Game Episode :4/500 High Score :17 Exploration rate :0.99
Game Episode :5/500 High Score :17 Exploration rate :0.98
Game Episode :6/500 High Score :11 Exploration rate :0.98
Game Episode :7/500 High Score :16 Exploration rate :0.97
Game Episode :8/500 High Score :21 Exploration rate :0.97
Game Episode :9/500 High Score :70 Exploration rate :0.96
Game Episode :10/500 High Score :27 Exploration rate :0.96
Game Episode :11/500 High Score :60 Exploration rate :0.95
Game Episode :12/500 High Score :11 Exploration rate :0.95
Game Episode :13/500 High Score :11 Exploration rate :0.94
Game Episode :14/500 High Score :11 Exploration rate :0.94
Game Episode :15/500 High Score :42 Exploration rate :0.93
Game Episode :16/500 High Score :43 Exploration rate :0.93
Game Epis

Game Episode :140/500 High Score :12 Exploration rate :0.5
Game Episode :141/500 High Score :8 Exploration rate :0.5
Game Episode :142/500 High Score :18 Exploration rate :0.49
Game Episode :143/500 High Score :9 Exploration rate :0.49
Game Episode :144/500 High Score :12 Exploration rate :0.49
Game Episode :145/500 High Score :8 Exploration rate :0.49
Game Episode :146/500 High Score :12 Exploration rate :0.48
Game Episode :147/500 High Score :11 Exploration rate :0.48
Game Episode :148/500 High Score :39 Exploration rate :0.48
Game Episode :149/500 High Score :12 Exploration rate :0.48
Game Episode :150/500 High Score :13 Exploration rate :0.47
Game Episode :151/500 High Score :13 Exploration rate :0.47
Game Episode :152/500 High Score :9 Exploration rate :0.47
Game Episode :153/500 High Score :10 Exploration rate :0.47
Game Episode :154/500 High Score :9 Exploration rate :0.46
Game Episode :155/500 High Score :8 Exploration rate :0.46
Game Episode :156/500 High Score :15 Exploration

Game Episode :278/500 High Score :10 Exploration rate :0.25
Game Episode :279/500 High Score :8 Exploration rate :0.25
Game Episode :280/500 High Score :7 Exploration rate :0.25
Game Episode :281/500 High Score :13 Exploration rate :0.25
Game Episode :282/500 High Score :15 Exploration rate :0.24
Game Episode :283/500 High Score :8 Exploration rate :0.24
Game Episode :284/500 High Score :9 Exploration rate :0.24
Game Episode :285/500 High Score :10 Exploration rate :0.24
Game Episode :286/500 High Score :7 Exploration rate :0.24
Game Episode :287/500 High Score :8 Exploration rate :0.24
Game Episode :288/500 High Score :8 Exploration rate :0.24
Game Episode :289/500 High Score :11 Exploration rate :0.24
Game Episode :290/500 High Score :12 Exploration rate :0.23
Game Episode :291/500 High Score :10 Exploration rate :0.23
Game Episode :292/500 High Score :9 Exploration rate :0.23
Game Episode :293/500 High Score :7 Exploration rate :0.23
Game Episode :294/500 High Score :8 Exploration r