In [1]:
# dependencies
import tensorflow as tf
import numpy as np
from DQN import DQN
from Experience import ExperienceBuffer
from Game import GameEnv

In [2]:
# Run options (bookkeeping, not learning hyperparameters).
checkpoints_path = 'checkpoints/'  # passed to tf.train.latest_checkpoint and to saver.save
load_model = False                 # True -> restore from the latest checkpoint instead of running the initializer
info_period = 10                   # print a progress summary every `info_period` episodes
save_period = 500                  # write a checkpoint every `save_period` episodes

In [3]:
# Training hyperparameters.

# -- run length --
num_episodes = 10000   # number of episodes the training loop iterates over
max_ep_len = 1000000   # upper bound on steps taken inside a single episode
pre_train = 1000       # steps of purely random play before any gradient update is run

# -- learning --
batch_size = 50        # transitions requested from the experience buffer per update
gamma = 0.9            # discount applied to the max next-state Q value in the target
epsilon = 0.1          # probability of picking a random action instead of the network's choice

In [4]:
# Create the long-lived objects shared with the training cell: the game
# environment, the DQN model, and the experience buffer that training
# batches are sampled from.
# NOTE(review): 2560 is handed straight to GameEnv; presumably a screen
# dimension in pixels -- confirm against Game.py.
env = GameEnv(2560)
dqn = DQN()
# NOTE(review): 100000 is presumably the buffer capacity (maximum number of
# stored transitions) -- confirm against Experience.py.
experience_buffer = ExperienceBuffer(100000)

In [5]:
# training
#
# Epsilon-greedy Q-learning loop.
#
# For each episode:
#   * play until the environment reports done (or max_ep_len steps pass),
#     choosing a random action with probability `epsilon` -- and always
#     during the first `pre_train` steps -- otherwise the action given by
#     dqn.predict for the current state;
#   * record every transition [s, a, r, s1, d] in a per-episode buffer;
#   * once more than `pre_train` steps have been taken, run one training
#     step per environment step on a batch drawn from `experience_buffer`;
#   * when the episode ends, append its transitions to `experience_buffer`.
#
# Every `save_period` episodes a checkpoint is written; every `info_period`
# episodes a short summary is printed. Ctrl-C (KeyboardInterrupt) stops
# training without a traceback.
#
# NOTE(review): `experience_buffer` is only filled at the end of an episode,
# so if a first episode ever ran past `pre_train` steps the buffer would
# still be empty at the first sampling call -- check how
# ExperienceBuffer.batch behaves in that case.
total_steps = 0

# Build the max-over-actions op exactly once. Calling tf.reduce_max inside
# the step loop would add a new node to the graph on every update, so the
# graph would keep growing for the whole run.
max_next_q = tf.reduce_max(dqn.q_vals, axis=1)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
path = tf.train.latest_checkpoint(checkpoints_path)
rewards = []  # total reward of each finished episode, in order

with tf.Session() as sess:
    try:
        if load_model:
            saver.restore(sess, path)
        else:
            sess.run(init)

        for i in range(num_episodes):
            episode_buffer = ExperienceBuffer(100000)
            episode_reward = 0.0

            # The flag passed to reset is True for the very first episode only.
            s, r, d = env.reset(i==0)

            for j in range(max_ep_len):
                # Explore: random action from {0, 1, 2}; exploit: network's pick.
                if np.random.rand(1) < epsilon or total_steps < pre_train:
                    a = np.random.randint(0, 3)
                else:
                    a = sess.run(dqn.predict, feed_dict={dqn.x: [s]})[0]

                s1, r, d = env.step(a)
                # The transition mixes a state with scalars, so the row has to
                # be an object array; say so explicitly instead of relying on
                # NumPy's implicit ragged-sequence conversion.
                transition = np.array([s, a, r, s1, d], dtype=object)
                episode_buffer.add(np.reshape(transition, [1,5]))
                episode_reward += r
                total_steps += 1

                if total_steps > pre_train:
                    # Columns of a batch: 0 = s, 1 = a, 2 = r, 3 = s1, 4 = d.
                    batch = experience_buffer.batch(batch_size)
                    Q1 = sess.run(max_next_q, feed_dict={dqn.x: np.vstack(batch[:, 3])})

                    # Bootstrap from the next state only when the transition
                    # did not end the episode; a terminal transition's target
                    # is just its reward.
                    batch_rewards = batch[:, 2].astype(np.float32)
                    batch_done = batch[:, 4].astype(np.float32)
                    targetQ = batch_rewards + gamma * Q1 * (1.0 - batch_done)

                    sess.run(dqn.train, feed_dict={dqn.x: np.vstack(batch[:, 0]),
                                                   dqn.targetQ: targetQ,
                                                   dqn.actions: batch[:, 1]})

                # Advance the current state. Without this every action would be
                # chosen from, and every transition recorded against, the
                # episode's initial observation.
                s = s1

                if d:
                    break

            experience_buffer.add(episode_buffer._buffer)
            rewards.append(episode_reward)

            if i % save_period == 0:
                saver.save(sess, checkpoints_path, global_step=i)
                # Report only after the save call has returned.
                print('model saved')

            if i % info_period == 0:
                print('|------------------------|')
                print('total episodes: {}\n'
                      'total steps: {}\n'
                      'last {} episodes average reward: {}'.format(i+1, total_steps, info_period,
                                                                   np.mean(rewards[-info_period:])))
                print('|------------------------|')
    except KeyboardInterrupt:
        print('Training stoped')

10 seconds before start.
Open game tab in full screen mode!
model saved
|------------------------|
total episodes: 1
total steps: 17
last 10 episodes average reward: 16.0
|------------------------|
|------------------------|
total episodes: 11
total steps: 108
last 10 episodes average reward: 8.1
|------------------------|
|------------------------|
total episodes: 21
total steps: 215
last 10 episodes average reward: 9.7
|------------------------|
|------------------------|
total episodes: 31
total steps: 306
last 10 episodes average reward: 8.1
|------------------------|
|------------------------|
total episodes: 41
total steps: 409
last 10 episodes average reward: 9.3
|------------------------|
|------------------------|
total episodes: 51
total steps: 499
last 10 episodes average reward: 8.0
|------------------------|
|------------------------|
total episodes: 61
total steps: 595
last 10 episodes average reward: 8.6
|------------------------|
Training stoped
