In [1]:
#
# Mountaincar problem using DQN and seperate target network
#

import gym
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from collections import deque


class DQN:
    def __init__(self, env):
        # Environment to use
        self.env = env
        # Replay memory
        self.memory = deque(maxlen=10000)

        # Discount factor
        self.gamma = 0.99

        # Initial exploration factor
        self.epsilon = 1.0
        # Minimum value exploration factor
        self.epsilon_min = 0.005
        # Decay for epsilon
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000

        self.batch_size = 64
        self.train_start = 1000
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        # Learning rate
        self.learning_rate = 0.001

        # Model being trained
        self.model = self.create_model()
        # Target model used to predict Q(S,A)
        self.target_model = self.create_model()

    def create_model(self):
        model = Sequential()
        model.add(Dense(
            32, input_dim=self.state_size, activation='relu', kernel_initializer="he_uniform"))
        model.add(Dense(16, activation='relu', kernel_initializer="he_uniform"))
        model.add(Dense(self.env.action_space.n, activation="linear",
                        kernel_initializer="he_uniform"))
        model.compile(
            loss="mean_squared_error", optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        # Decay exploration rate by epsilon decay
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        if len(self.memory) < self.train_start:
            return

        mini_batch = random.sample(self.memory, self.batch_size)

        update_input = np.zeros((self.batch_size, self.state_size))
        update_target = np.zeros((self.batch_size, self.action_size))

        for i in range(self.batch_size):
            state, action, reward, next_state, done = mini_batch[i]
            target = self.model.predict(state)[0]

            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * \
                    np.amax(self.target_model.predict(next_state)[0])
            update_input[i] = state
            update_target[i] = target

        self.model.fit(update_input, update_target,
                       batch_size=self.batch_size, epochs=1, verbose=0)

    def target_train(self):
        # Simply copy the weights of the model to target_model
        self.target_model.set_weights(self.model.get_weights())
        return

    def save_model(self, fn):
        self.model.save(fn)


def main():
    env = gym.make("MountainCar-v0")

    trials = 4000
    trial_len = 500

    dqn_agent = DQN(env=env)
    for trial in range(trials):
        cur_state = env.reset().reshape(1, 2)
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)

            new_state = new_state.reshape(1, 2)
            dqn_agent.remember(cur_state, action, reward, new_state, done)

            dqn_agent.replay()

            cur_state = new_state
            if done:
                env.reset()
                dqn_agent.target_train()
                break

        print("Iteration: {} Score: -{}".format(trial, step))


if __name__ == "__main__":
    main()


Using TensorFlow backend.


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Iteration: 0 Score: -199
Iteration: 1 Score: -199
Iteration: 2 Score: -199
Iteration: 3 Score: -199
Iteration: 4 Score: -199
Iteration: 5 Score: -199
Iteration: 6 Score: -199
Iteration: 7 Score: -199
Iteration: 8 Score: -199
Iteration: 9 Score: -199
Iteration: 10 Score: -199
Iteration: 11 Score: -199
Iteration: 12 Score: -199
Iteration: 13 Score: -199
Iteration: 14 Score: -199
Iteration: 15 Score: -199
Iteration: 16 Score: -199
Iteration: 17 Score: -199
Iteration: 18 Score: -199
Iteration: 19 Score: -199
Iteration: 20 Score: -199
Iteration: 21 Score: -199
Iteration: 22 Score: -199
Iteration: 23 Score: -199
Iteration: 24 Score: -199
Iteration: 25 Score: -199
Iteration: 26 Score: -199
Iteration: 27 Score: -199
Iteration: 28 Score: -199
Iteration: 29 Score: -199
Iteration: 30 Score: -199
Iteration: 31 Score: -199
Iteration: 32 Score: -199
Iteration: 33 Score: -199
Iteration: 34 Scor

Iteration: 304 Score: -102
Iteration: 305 Score: -171
Iteration: 306 Score: -199
Iteration: 307 Score: -199
Iteration: 308 Score: -107
Iteration: 309 Score: -94
Iteration: 310 Score: -115
Iteration: 311 Score: -90
Iteration: 312 Score: -157
Iteration: 313 Score: -199
Iteration: 314 Score: -199
Iteration: 315 Score: -199
Iteration: 316 Score: -199
Iteration: 317 Score: -199
Iteration: 318 Score: -105
Iteration: 319 Score: -199
Iteration: 320 Score: -199
Iteration: 321 Score: -87
Iteration: 322 Score: -199
Iteration: 323 Score: -199
Iteration: 324 Score: -199
Iteration: 325 Score: -125
Iteration: 326 Score: -199
Iteration: 327 Score: -199
Iteration: 328 Score: -94
Iteration: 329 Score: -89
Iteration: 330 Score: -162
Iteration: 331 Score: -170
Iteration: 332 Score: -155
Iteration: 333 Score: -153
Iteration: 334 Score: -162
Iteration: 335 Score: -88
Iteration: 336 Score: -199
Iteration: 337 Score: -199
Iteration: 338 Score: -199
Iteration: 339 Score: -199
Iteration: 340 Score: -199
Iterati

Iteration: 611 Score: -199
Iteration: 612 Score: -199
Iteration: 613 Score: -199
Iteration: 614 Score: -151
Iteration: 615 Score: -98
Iteration: 616 Score: -199
Iteration: 617 Score: -87
Iteration: 618 Score: -199
Iteration: 619 Score: -102
Iteration: 620 Score: -199
Iteration: 621 Score: -93
Iteration: 622 Score: -90
Iteration: 623 Score: -178
Iteration: 624 Score: -199
Iteration: 625 Score: -199
Iteration: 626 Score: -84
Iteration: 627 Score: -199
Iteration: 628 Score: -199
Iteration: 629 Score: -199
Iteration: 630 Score: -199
Iteration: 631 Score: -88
Iteration: 632 Score: -161
Iteration: 633 Score: -199
Iteration: 634 Score: -184
Iteration: 635 Score: -199
Iteration: 636 Score: -199
Iteration: 637 Score: -199
Iteration: 638 Score: -158
Iteration: 639 Score: -199
Iteration: 640 Score: -153
Iteration: 641 Score: -199
Iteration: 642 Score: -199
Iteration: 643 Score: -199
Iteration: 644 Score: -199
Iteration: 645 Score: -199
Iteration: 646 Score: -159
Iteration: 647 Score: -90
Iteratio

KeyboardInterrupt: 