In [3]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque

# Create the Cart-Pole game environment; this single `env` instance is shared
# by the replay-memory pre-fill and by the training loop further down.
env = gym.make('CartPole-v0')


class QNetwork:
    """Small fully connected network mapping a state to per-action Q-values.

    The network has two ReLU hidden layers of ``hidden_size`` units each and
    a linear output layer with one unit per action.  It is compiled with a
    mean-squared-error loss and an Adam optimizer.

    Attributes:
        model: the compiled Keras ``Sequential`` model.
        optimizer: the ``Adam`` instance the model was compiled with.
    """

    def __init__(self, learning_rate=0.01, state_size=4,
                 action_size=2, hidden_size=10):
        """Build and compile the model.

        Args:
            learning_rate: step size handed to the Adam optimizer.
            state_size: number of input features (length of a state vector).
            action_size: number of output units, one Q-value per action.
            hidden_size: number of units in each of the two hidden layers.
        """
        layers = [
            # First hidden layer also declares the input width.
            Dense(hidden_size, activation='relu', input_dim=state_size),
            Dense(hidden_size, activation='relu'),
            # Linear head: Q-values are unbounded regression outputs.
            Dense(action_size, activation='linear'),
        ]
        self.model = Sequential(layers)

        self.optimizer = Adam(lr=learning_rate)
        self.model.compile(loss='mse', optimizer=self.optimizer)


class Memory():
    """Fixed-capacity experience store with uniform random sampling.

    Experiences are kept in a ``deque`` created with ``maxlen=max_size``, so
    once the capacity is reached each new entry pushes out the oldest one.
    """

    def __init__(self, max_size=1000):
        """Create an empty buffer holding at most ``max_size`` experiences."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the right end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences as a list.

        Positions are drawn without replacement, so numpy raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored
        experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)
        picked = []
        for pos in chosen:
            picked.append(self.buffer[pos])
        return picked


# --- Training schedule ---
# Upper bound on the number of training episodes.
train_episodes = 1000
# Upper bound on the number of steps taken within one episode.
max_steps = 200
# Discount factor applied to the estimated future reward.
gamma = 0.99

# --- Epsilon-greedy exploration schedule ---
# Probability of taking a random action at the very first step.
explore_start = 1.0
# Floor the exploration probability decays towards.
explore_stop = 0.01
# Rate of the exponential decay, applied per global training step.
decay_rate = 0.0001

# --- Q-network ---
# Width of each hidden layer.
hidden_size = 16
# Step size for the network's optimizer.
learning_rate = 0.001

# --- Replay memory ---
# Maximum number of experiences kept in memory.
memory_size = 10000
# Number of experiences drawn for each replay update.
batch_size = 32
# Number of random experiences stored before training begins; one full
# mini-batch, so the first replay sample can be drawn without replacement.
pretrain_length = batch_size

mainQN = QNetwork(learning_rate=learning_rate, hidden_size=hidden_size)

###################################
## Populate the experience memory
###################################
# Seeds the replay memory with `pretrain_length` transitions collected by
# acting uniformly at random, so that sampling a mini-batch is possible from
# the first training step on.

memory = Memory(max_size=memory_size)

# Start the simulation and nudge it with one random action; the observation
# returned by that step becomes the first stored state.
env.reset()
state, reward, done, _ = env.step(env.action_space.sample())
state = np.reshape(state, [1, 4])

for ii in range(pretrain_length):
    # Uncomment the line below to watch the simulation
    # env.render()

    # Act randomly and observe the outcome.
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    if not done:
        # Ordinary transition: store it and carry on from the new state.
        next_state = np.reshape(next_state, [1, 4])
        memory.add((state, action, reward, next_state))
        state = next_state
        continue

    # Episode over: an all-zero next state marks the transition as terminal.
    next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state))

    # Begin a fresh episode, again kicked off with one random action.
    env.reset()
    state, reward, done, _ = env.step(env.action_space.sample())
    state = np.reshape(state, [1, 4])

#############
## Training
#############
# Deep Q-learning loop.  At every environment step the agent acts
# epsilon-greedily, stores the transition in the replay memory, and then fits
# the Q-network on one mini-batch sampled from that memory.
step = 0  # global step counter across all episodes; drives exploration decay
# Episodes are numbered from 1, hence the +1 so that exactly
# `train_episodes` episodes are run.
for ep in range(1, train_episodes + 1):
    total_reward = 0
    t = 0
    while t < max_steps:
        step += 1
        # Uncomment this next line to watch the training
        # env.render()

        # Explore or Exploit: the exploration probability decays
        # exponentially with the global step count towards explore_stop.
        explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step)
        if explore_p > np.random.rand():
            # Make a random action
            action = env.action_space.sample()
        else:
            # Get action from Q-network (greedy w.r.t. predicted Q-values)
            Qs = mainQN.model.predict(state)[0]
            action = np.argmax(Qs)

        # Take action, get new state and reward
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        total_reward += reward

        if done:
            # The episode ends so there is no next state; an all-zero vector
            # is stored as the terminal marker the replay step looks for.
            next_state = np.zeros(state.shape)
            # Force the while-loop to exit after this iteration's replay.
            t = max_steps

            print('Episode: {}'.format(ep),
                  'Total reward: {}'.format(total_reward),
                  'Explore P: {:.4f}'.format(explore_p))

            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start new episode
            env.reset()
            # Take one random step to get the pole and cart moving
            state, reward, done, _ = env.step(env.action_space.sample())
            state = np.reshape(state, [1, 4])
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state
            t += 1

        # Replay: build Q-learning targets for one sampled mini-batch.
        minibatch = memory.sample(batch_size)
        states_b, actions_b, rewards_b, next_states_b = zip(*minibatch)
        inputs = np.vstack(states_b)            # one row per sampled state
        next_inputs = np.vstack(next_states_b)  # one row per next state
        actions_b = np.array(actions_b)
        rewards_b = np.array(rewards_b)

        # Two batched forward passes instead of several per sample; the
        # network weights do not change until fit(), so the resulting
        # targets are the same as with per-sample predictions.
        targets = np.array(mainQN.model.predict(inputs))
        next_Qs = mainQN.model.predict(next_inputs)

        # Terminal transitions were stored with an all-zero next state; they
        # get no bootstrapped future value, only the immediate reward.
        non_terminal = next_inputs.any(axis=1)
        td_targets = rewards_b + gamma * non_terminal * np.amax(next_Qs, axis=1)

        # Only the Q-value of the action actually taken is moved towards its
        # target; the other outputs keep their current predictions and thus
        # contribute zero error.
        targets[np.arange(len(minibatch)), actions_b] = td_targets
        mainQN.model.fit(inputs, targets, epochs=1, verbose=0)

Using TensorFlow backend.


Episode: 1 Total reward: 14.0 Explore P: 0.9986
Episode: 2 Total reward: 28.0 Explore P: 0.9959
Episode: 3 Total reward: 19.0 Explore P: 0.9940
Episode: 4 Total reward: 17.0 Explore P: 0.9923
Episode: 5 Total reward: 22.0 Explore P: 0.9901
Episode: 6 Total reward: 18.0 Explore P: 0.9884
Episode: 7 Total reward: 15.0 Explore P: 0.9869
Episode: 8 Total reward: 12.0 Explore P: 0.9857
Episode: 9 Total reward: 33.0 Explore P: 0.9825
Episode: 10 Total reward: 15.0 Explore P: 0.9811
Episode: 11 Total reward: 16.0 Explore P: 0.9795
Episode: 12 Total reward: 11.0 Explore P: 0.9785
Episode: 13 Total reward: 19.0 Explore P: 0.9766
Episode: 14 Total reward: 17.0 Explore P: 0.9750
Episode: 15 Total reward: 10.0 Explore P: 0.9740
Episode: 16 Total reward: 22.0 Explore P: 0.9719
Episode: 17 Total reward: 15.0 Explore P: 0.9705
Episode: 18 Total reward: 9.0 Explore P: 0.9696
Episode: 19 Total reward: 17.0 Explore P: 0.9680
Episode: 20 Total reward: 27.0 Explore P: 0.9654
Episode: 21 Total reward: 18.0

Episode: 168 Total reward: 13.0 Explore P: 0.7370
Episode: 169 Total reward: 20.0 Explore P: 0.7355
Episode: 170 Total reward: 18.0 Explore P: 0.7342
Episode: 171 Total reward: 12.0 Explore P: 0.7334
Episode: 172 Total reward: 29.0 Explore P: 0.7313
Episode: 173 Total reward: 19.0 Explore P: 0.7299
Episode: 174 Total reward: 37.0 Explore P: 0.7272
Episode: 175 Total reward: 20.0 Explore P: 0.7258
Episode: 176 Total reward: 43.0 Explore P: 0.7227
Episode: 177 Total reward: 16.0 Explore P: 0.7216
Episode: 178 Total reward: 11.0 Explore P: 0.7208
Episode: 179 Total reward: 12.0 Explore P: 0.7200
Episode: 180 Total reward: 12.0 Explore P: 0.7191
Episode: 181 Total reward: 18.0 Explore P: 0.7178
Episode: 182 Total reward: 17.0 Explore P: 0.7166
Episode: 183 Total reward: 23.0 Explore P: 0.7150
Episode: 184 Total reward: 11.0 Explore P: 0.7142
Episode: 185 Total reward: 12.0 Explore P: 0.7134
Episode: 186 Total reward: 17.0 Explore P: 0.7122
Episode: 187 Total reward: 12.0 Explore P: 0.7113


Episode: 333 Total reward: 11.0 Explore P: 0.5652
Episode: 334 Total reward: 14.0 Explore P: 0.5645
Episode: 335 Total reward: 14.0 Explore P: 0.5637
Episode: 336 Total reward: 11.0 Explore P: 0.5631
Episode: 337 Total reward: 22.0 Explore P: 0.5619
Episode: 338 Total reward: 15.0 Explore P: 0.5610
Episode: 339 Total reward: 29.0 Explore P: 0.5594
Episode: 340 Total reward: 14.0 Explore P: 0.5587
Episode: 341 Total reward: 22.0 Explore P: 0.5575
Episode: 342 Total reward: 19.0 Explore P: 0.5564
Episode: 343 Total reward: 26.0 Explore P: 0.5550
Episode: 344 Total reward: 15.0 Explore P: 0.5542
Episode: 345 Total reward: 22.0 Explore P: 0.5530
Episode: 346 Total reward: 34.0 Explore P: 0.5512
Episode: 347 Total reward: 12.0 Explore P: 0.5505
Episode: 348 Total reward: 13.0 Explore P: 0.5498
Episode: 349 Total reward: 15.0 Explore P: 0.5490
Episode: 350 Total reward: 18.0 Explore P: 0.5480
Episode: 351 Total reward: 21.0 Explore P: 0.5469
Episode: 352 Total reward: 14.0 Explore P: 0.5461


Episode: 497 Total reward: 9.0 Explore P: 0.2334
Episode: 498 Total reward: 17.0 Explore P: 0.2330
Episode: 499 Total reward: 15.0 Explore P: 0.2327
Episode: 500 Total reward: 13.0 Explore P: 0.2324
Episode: 501 Total reward: 12.0 Explore P: 0.2321
Episode: 502 Total reward: 15.0 Explore P: 0.2318
Episode: 503 Total reward: 11.0 Explore P: 0.2316
Episode: 504 Total reward: 12.0 Explore P: 0.2313
Episode: 505 Total reward: 10.0 Explore P: 0.2311
Episode: 506 Total reward: 17.0 Explore P: 0.2307
Episode: 507 Total reward: 13.0 Explore P: 0.2304
Episode: 508 Total reward: 8.0 Explore P: 0.2302
Episode: 509 Total reward: 12.0 Explore P: 0.2300
Episode: 510 Total reward: 17.0 Explore P: 0.2296
Episode: 511 Total reward: 12.0 Explore P: 0.2293
Episode: 512 Total reward: 10.0 Explore P: 0.2291
Episode: 513 Total reward: 116.0 Explore P: 0.2266
Episode: 514 Total reward: 157.0 Explore P: 0.2232
Episode: 515 Total reward: 199.0 Explore P: 0.2190
Episode: 516 Total reward: 199.0 Explore P: 0.214

Episode: 660 Total reward: 8.0 Explore P: 0.0590
Episode: 661 Total reward: 9.0 Explore P: 0.0589
Episode: 662 Total reward: 10.0 Explore P: 0.0589
Episode: 663 Total reward: 9.0 Explore P: 0.0588
Episode: 664 Total reward: 9.0 Explore P: 0.0588
Episode: 665 Total reward: 7.0 Explore P: 0.0588
Episode: 666 Total reward: 10.0 Explore P: 0.0587
Episode: 667 Total reward: 9.0 Explore P: 0.0587
Episode: 668 Total reward: 7.0 Explore P: 0.0586
Episode: 669 Total reward: 10.0 Explore P: 0.0586
Episode: 670 Total reward: 9.0 Explore P: 0.0585
Episode: 671 Total reward: 11.0 Explore P: 0.0585
Episode: 672 Total reward: 11.0 Explore P: 0.0584
Episode: 673 Total reward: 10.0 Explore P: 0.0584
Episode: 674 Total reward: 10.0 Explore P: 0.0583
Episode: 675 Total reward: 11.0 Explore P: 0.0583
Episode: 676 Total reward: 11.0 Explore P: 0.0582
Episode: 677 Total reward: 8.0 Explore P: 0.0582
Episode: 678 Total reward: 7.0 Explore P: 0.0582
Episode: 679 Total reward: 11.0 Explore P: 0.0581
Episode: 6

Episode: 825 Total reward: 40.0 Explore P: 0.0318
Episode: 826 Total reward: 40.0 Explore P: 0.0317
Episode: 827 Total reward: 46.0 Explore P: 0.0316
Episode: 828 Total reward: 32.0 Explore P: 0.0315
Episode: 829 Total reward: 33.0 Explore P: 0.0315
Episode: 830 Total reward: 51.0 Explore P: 0.0314
Episode: 831 Total reward: 37.0 Explore P: 0.0313
Episode: 832 Total reward: 35.0 Explore P: 0.0312
Episode: 833 Total reward: 61.0 Explore P: 0.0311
Episode: 834 Total reward: 40.0 Explore P: 0.0310
Episode: 835 Total reward: 43.0 Explore P: 0.0309
Episode: 836 Total reward: 43.0 Explore P: 0.0308
Episode: 837 Total reward: 39.0 Explore P: 0.0307
Episode: 838 Total reward: 37.0 Explore P: 0.0306
Episode: 839 Total reward: 47.0 Explore P: 0.0306
Episode: 840 Total reward: 39.0 Explore P: 0.0305
Episode: 841 Total reward: 35.0 Explore P: 0.0304
Episode: 842 Total reward: 39.0 Explore P: 0.0303
Episode: 843 Total reward: 31.0 Explore P: 0.0303
Episode: 844 Total reward: 37.0 Explore P: 0.0302


Episode: 989 Total reward: 62.0 Explore P: 0.0182
Episode: 990 Total reward: 60.0 Explore P: 0.0181
Episode: 991 Total reward: 48.0 Explore P: 0.0181
Episode: 992 Total reward: 9.0 Explore P: 0.0181
Episode: 993 Total reward: 10.0 Explore P: 0.0181
Episode: 994 Total reward: 72.0 Explore P: 0.0180
Episode: 995 Total reward: 73.0 Explore P: 0.0180
Episode: 996 Total reward: 56.0 Explore P: 0.0179
Episode: 997 Total reward: 30.0 Explore P: 0.0179
Episode: 998 Total reward: 46.0 Explore P: 0.0179
Episode: 999 Total reward: 12.0 Explore P: 0.0178
