In [39]:
import random
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from collections import deque
from skimage import transform
from vizdoom import *

warnings.filterwarnings('ignore')
'''
My endeavours into reinforcement learning based on the fantastic course by Thomas Simonini: https://simoninithomas.github.io/Deep_reinforcement_learning_Course/
'''

In [40]:
def create_environment(config_path="C:/Python36/Lib/site-packages/vizdoom/scenarios/basic.cfg",
                       scenario_path="C:/Python36/Lib/site-packages/vizdoom/scenarios/basic.wad"):
    """Create a configured ViZDoom game together with its action list.

    The game is only configured here; ``init()`` is NOT called, so the caller
    decides when the engine is actually started.

    Parameters
    ----------
    config_path : str, optional
        Path of the scenario ``.cfg`` file passed to ``DoomGame.load_config``.
        Defaults to the install location this notebook was written against.
    scenario_path : str, optional
        Path of the scenario ``.wad`` file passed to
        ``DoomGame.set_doom_scenario_path``. Same default convention.

    Returns
    -------
    game : DoomGame
        The configured, not yet initialised game.
    possible_actions : list of list of int
        Three one-hot button vectors, ordered ``[left, right, shoot]`` as
        named in this notebook.
    """
    game = DoomGame()
    game.load_config(config_path)
    game.set_doom_scenario_path(scenario_path)

    # One-hot encodings; position i is the i-th button of the loaded config.
    left = [1, 0, 0]
    right = [0, 1, 0]
    shoot = [0, 0, 1]
    possible_actions = [left, right, shoot]

    return game, possible_actions

def test_environment():
    game = DoomGame()
    game.load_config("C:/Python36/Lib/site-packages/vizdoom/scenarios/basic.cfg")
    game.set_doom_scenario_path("C:/Python36/Lib/site-packages/vizdoom/scenarios/basic.wad")
    game.init()
    
    shoot = [0, 0, 1]
    left = [1, 0, 0]
    right = [0, 1, 0]
    actions = [shoot, left, right]
    
    episodes = 10
    for episode in range(episodes):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(actions)
            reward = game.make_action(action)
            print(action, reward)
            time.sleep(0.02)
        print("Result: ", game.get_total_reward())
        time.sleep(2)
    game.close()
    
def preprocess_frame(frame):
    """Convert one screen buffer into an 84x84 image ready for stacking.

    Steps, in order:
      1. If the first axis has length 3, average over it (treated as the
         colour-channel axis) to obtain a single-channel image.
      2. Drop 30 columns from the left and 30 from the right.
      3. Divide by 255 (presumably 8-bit pixel values -- TODO confirm).
      4. Resize to 84x84 with ``skimage.transform.resize``.

    Returns whatever ``transform.resize`` returns for the 84x84 request.
    """
    has_three_leading_planes = frame.shape[0] == 3
    gray = np.mean(frame, axis=0) if has_three_leading_planes else frame
    scaled = gray[:, 30:-30] / 255
    return transform.resize(scaled, [84, 84])

# Build the configured (not yet initialised) game and its action list once at notebook
# scope; later cells read `game` and `possible_actions` as globals.
game, possible_actions = create_environment()

In [41]:
# Number of consecutive frames combined into one network input.
stack_size = 4
# Frame buffer pre-filled with blank 84x84 frames. The builtin `int` replaces the
# `np.int` alias (no longer provided by current NumPy), and `maxlen` follows
# `stack_size` instead of repeating the literal 4.
stacked_frames = deque([np.zeros((84, 84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess ``state`` and fold it into the rolling frame stack.

    Parameters
    ----------
    stacked_frames : collections.deque
        Current frame buffer. Ignored (and replaced) when ``is_new_episode``
        is true; otherwise it is appended to IN PLACE.
    state : array-like
        Raw frame, handed to ``preprocess_frame``.
    is_new_episode : bool
        When true, start a fresh buffer holding ``stack_size`` references to
        the first preprocessed frame, since there is no history yet.

    Returns
    -------
    stacked_state : numpy.ndarray
        The buffered frames stacked along a new last axis (``axis=2``).
    stacked_frames : collections.deque
        The buffer to pass into the next call.
    """
    frame = preprocess_frame(state)
    if is_new_episode:
        # Build the buffer straight from the first frame instead of creating a
        # zero-filled deque (via the removed `np.int` alias) and overwriting it.
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        # With a bounded deque this drops the oldest frame automatically.
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames

In [42]:
# Hyperparameters

# -- network -----------------------------------------------------------------
state_size = [84, 84, 4]                         # 84x84 frames, 4 stacked along the last axis
action_size = game.get_available_buttons_size()  # queried from the configured game
learning_rate = 5e-4

# -- exploration schedule (consumed by predict_action) -------------------------
explore_start = 0.8
explore_stop = 0.01
decay_rate = 1e-3

# -- training loop -------------------------------------------------------------
total_episodes = 200
max_steps = 100   # upper bound on steps taken per episode
batch_size = 64
gamma = 0.95      # discount applied to the best next-state Q-value

# -- replay memory -------------------------------------------------------------
memory_size = 10 ** 6
pretrain_length = batch_size  # transitions gathered with random actions before training

# -- run switches --------------------------------------------------------------
training = True
episode_render = False

In [43]:
class DeepQNetwork:
    """Convolutional Q-network built in the TF1 graph API.

    Architecture: three conv -> batch-norm -> ELU stages (32, 64, 128 filters),
    a flatten, two ELU dense layers (512, 128 units) and a linear output layer
    with one Q-value per action.

    Parameters
    ----------
    state_size : sequence of int
        Shape of a single input state (without the batch axis).
    action_size : int
        Number of actions. Sizes both the one-hot ``actions_`` placeholder and
        the output layer (previously hard-coded to 3).
    learning_rate : float
        Learning rate handed to ``tf.train.RMSPropOptimizer``.
    name : str, optional
        Variable scope under which all layers are created.

    Attributes of interest
    ----------------------
    inputs_, actions_, target_Q : placeholders to feed.
    output : Q-values for every action.
    Q : Q-value of the action selected by the one-hot ``actions_``.
    loss : mean squared error between ``target_Q`` and ``Q``.
    optimizer : RMSProp training op minimising ``loss``.
    """

    def __init__(self, state_size, action_size, learning_rate, name='DeepQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            # One-hot action per sample; width follows action_size.
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions")
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # NOTE(review): every batch-norm layer below is built with training=True
            # unconditionally, so the same mode is used when the network is only
            # queried for actions -- confirm this is intended.

            # First convolution: CNN (Batch Normalized with ELU)
            self.conv1 = tf.layers.conv2d(inputs=self.inputs_, filters=32, kernel_size=[8, 8],
                                          strides=[4, 4], padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv1")
            self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1, training=True,
                                                                 epsilon=1e-5, name='batch_norm1')
            self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out")

            # Second convolution
            self.conv2 = tf.layers.conv2d(inputs=self.conv1_out, filters=64, kernel_size=[4, 4],
                                          strides=[2, 2], padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv2")
            self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2, training=True,
                                                                 epsilon=1e-5, name="batch_norm2")
            self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")

            # Third convolution
            self.conv3 = tf.layers.conv2d(inputs=self.conv2_out, filters=128, kernel_size=[4, 4],
                                          strides=[2, 2], padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv3")
            self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3, training=True,
                                                                 epsilon=1e-5, name="batch_norm3")
            self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")

            # Convolution output to feature vector
            self.flatten = tf.layers.flatten(self.conv3_out)

            # Fully connected head
            self.fc1 = tf.layers.dense(inputs=self.flatten, units=512, activation=tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                       name="fc1")
            self.fc2 = tf.layers.dense(inputs=self.fc1, units=128, activation=tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                       name="fc2")
            # Linear layer: one Q-value per action. Left unnamed (as before) so
            # the variable names stored in existing checkpoints are unchanged.
            self.output = tf.layers.dense(inputs=self.fc2,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units=action_size, activation=None)

            # Mask the Q-values with the one-hot action to pick Q(s, a).
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

In [44]:
# Clear the default graph before the network is built into it.
tf.reset_default_graph()
# NOTE(review): this rebinds the name `DeepQNetwork` from the class to an instance.
# Running this cell a second time without re-running the class cell would call the
# instance instead of the class. Later cells access the network through this name.
DeepQNetwork = DeepQNetwork(state_size, action_size, learning_rate)

In [45]:
class Memory():
    """Bounded replay buffer backed by a ``collections.deque``.

    Once ``max_size`` items are stored, adding another one discards the
    oldest entry.
    """

    def __init__(self, max_size):
        """Create an empty buffer that holds at most ``max_size`` experiences."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences drawn uniformly at random.

        Sampling is without replacement, so asking for more items than are
        stored makes ``np.random.choice`` raise ``ValueError``.
        """
        candidates = np.arange(len(self.buffer))
        chosen = np.random.choice(candidates, size=batch_size, replace=False)
        return list(map(self.buffer.__getitem__, chosen))

In [46]:
# Start the engine and pre-fill the replay memory with `pretrain_length`
# transitions generated by uniformly random actions.
game.init()
memory = Memory(max_size=memory_size)
game.new_episode()

for pretrain_step in range(pretrain_length):
    if pretrain_step == 0:
        # Very first transition: build the initial stacked state.
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    action = random.choice(possible_actions)
    reward = game.make_action(action)
    done = game.is_episode_finished()

    if not done:
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        memory.add((state, action, reward, next_state, done))
        state = next_state
        continue

    # Terminal step: store an all-zero successor, then begin a new episode.
    next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state, done))
    game.new_episode()
    state = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, state, True)
        

In [47]:
# Event files are written under the absolute path /tensorboard/dqn/1
# (relative to the filesystem root, not to the notebook's directory).
writer = tf.summary.FileWriter("/tensorboard/dqn/1")
# Register the network's loss tensor as a scalar summary tagged "Loss".
tf.summary.scalar("Loss", DeepQNetwork.loss)
# Merged summary op; the training loop runs it and passes the result to `writer`.
write_op = tf.summary.merge_all()

In [48]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose an action epsilon-greedily with an exponentially decaying epsilon.

    The exploration probability is
    ``explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)``.

    Parameters
    ----------
    explore_start, explore_stop : float
        Exploration probability at ``decay_step == 0`` and its asymptote.
    decay_rate : float
        Exponential decay coefficient.
    decay_step : int
        Number of steps taken so far.
    state : numpy.ndarray
        Current stacked state; a leading batch axis of length 1 is added
        before it is fed to the network.
    actions : sequence
        Candidate actions. Both the random pick and the greedy pick index
        into THIS sequence (the original ignored it and read the global
        ``possible_actions``).

    Returns
    -------
    action
        The selected element of ``actions``.
    explore_probability : float
        The exploration probability used for this decision.

    Notes
    -----
    The greedy branch relies on the notebook globals ``sess`` and
    ``DeepQNetwork``.
    """
    exp_exp_tradeoff = np.random.rand()
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Explore: uniformly random action.
        action = random.choice(actions)
    else:
        # Exploit: action with the highest predicted Q-value.
        Qs = sess.run(DeepQNetwork.output, feed_dict={DeepQNetwork.inputs_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)
        action = actions[int(choice)]

    return action, explore_probability

# Checkpoint helper: the training cell saves to ./models/model.ckpt with it and the
# evaluation cell restores from the same path.
saver = tf.train.Saver()

In [49]:
# Deep Q-learning training loop: play `total_episodes` episodes of at most
# `max_steps` steps, store every transition in `memory`, and run one optimisation
# step on a random minibatch after every environment step.
if training:
    # Defined up front so the end-of-episode report below can be formatted even
    # if an episode terminates before any optimisation step has produced a loss.
    loss = float("nan")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        decay_step = 0  # global step counter driving the exploration decay
        for episode in range(total_episodes):
            start_time = time.time()
            step = 0
            episode_rewards = []
            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1
                decay_step += 1
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate,
                                                             decay_step, state, possible_actions)
                reward = game.make_action(action)
                done = game.is_episode_finished()
                episode_rewards.append(reward)

                if done:
                    # Terminal transition: a blank frame stands in for the next
                    # observation. Builtin `int` replaces the removed `np.int` alias.
                    next_state = np.zeros((84, 84), dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    step = max_steps  # forces the while-loop to end after this iteration
                    total_reward = np.sum(episode_rewards)
                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(explore_probability))
                    memory.add((state, action, reward, next_state, done))
                    print(time.time() - start_time)
                else:
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    memory.add((state, action, reward, next_state, done))
                    state = next_state

                # ---- learning step on a random minibatch ----
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch])
                dones_mb = np.array([each[4] for each in batch])

                Qs_next_state = sess.run(DeepQNetwork.output, feed_dict={DeepQNetwork.inputs_: next_states_mb})

                # Target is r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
                target_Qs_batch = [
                    rewards_mb[i] if dones_mb[i]
                    else rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                    for i in range(len(batch))
                ]
                targets_mb = np.array(target_Qs_batch)

                loss, _ = sess.run([DeepQNetwork.loss, DeepQNetwork.optimizer],
                                   feed_dict={DeepQNetwork.inputs_: states_mb,
                                              DeepQNetwork.target_Q: targets_mb,
                                              DeepQNetwork.actions_: actions_mb})
                # Evaluated in a separate run, i.e. after the weight update above.
                summary = sess.run(write_op, feed_dict={DeepQNetwork.inputs_: states_mb,
                                                        DeepQNetwork.target_Q: targets_mb,
                                                        DeepQNetwork.actions_: actions_mb})
                # NOTE(review): the episode index is used as the summary step, so all
                # steps of one episode are logged at the same x position.
                writer.add_summary(summary, episode)
                writer.flush()
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")

Episode: 0 Total reward: 24.0 Training loss: 235.0504 Explore P: 0.7525
23.14828586578369
Episode: 1 Total reward: 95.0 Training loss: 98.0138 Explore P: 0.7481
1.8131740093231201
Episode: 4 Total reward: 95.0 Training loss: 176.9392 Explore P: 0.6107
1.8400049209594727
Episode: 6 Total reward: 95.0 Training loss: 49.1921 Explore P: 0.5503
2.2429442405700684
Episode: 10 Total reward: 94.0 Training loss: 23.6845 Explore P: 0.4074
2.229400396347046
Episode: 12 Total reward: 95.0 Training loss: 1.7553 Explore P: 0.3675
2.0055696964263916
Episode: 14 Total reward: 92.0 Training loss: 4.1190 Explore P: 0.3305
2.9945971965789795
Episode: 16 Total reward: 66.0 Training loss: 7.4321 Explore P: 0.2915
10.633217811584473
Episode: 17 Total reward: 49.0 Training loss: 0.9055 Explore P: 0.2799
15.599941968917847
Episode: 18 Total reward: 95.0 Training loss: 1.7241 Explore P: 0.2783
1.8934600353240967
Episode: 20 Total reward: 92.0 Training loss: 4.4382 Explore P: 0.2506
3.204519748687744
Episode: 2

In [55]:
# Evaluate the saved model: play 100 episodes, always taking the action with the
# highest predicted Q-value, and collect each episode's total reward in `scores`.
scores = []
with tf.Session() as sess:
    # NOTE(review): this rebinds `game` to a new instance; the instance used for
    # training is not closed here -- confirm whether it should be closed first.
    game, possible_actions = create_environment()
    saver.restore(sess, "./models/model.ckpt")
    game.init()
    for i in range(100):
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        while not game.is_episode_finished():
            Qs = sess.run(DeepQNetwork.output, feed_dict={DeepQNetwork.inputs_: state.reshape((1, *state.shape))})
            choice = np.argmax(Qs)
            action = possible_actions[int(choice)]
            game.make_action(action)

            if game.is_episode_finished():
                break

            # Episode still running: push the new frame onto the stack.
            next_state = game.get_state().screen_buffer
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state
        score = game.get_total_reward()
        scores.append(score)
        print("Score: ", score)

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Score:  95.0
Score:  95.0
Score:  87.0
Score:  83.0
Score:  74.0
Score:  78.0
Score:  95.0
Score:  95.0
Score:  81.0
Score:  66.0
Score:  55.0
Score:  85.0
Score:  95.0
Score:  95.0
Score:  55.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  79.0
Score:  94.0
Score:  83.0
Score:  87.0
Score:  56.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  74.0
Score:  70.0
Score:  59.0
Score:  90.0
Score:  65.0
Score:  66.0
Score:  95.0
Score:  90.0
Score:  95.0
Score:  73.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  70.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  60.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  88.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  90.0
Score:  95.0
Score:  63.0
Score:  95.0
Score:  92.0
Score:  60.0
Score:  95.0
Score:  95.0
Score:  50.0
Score:  61.0
Score:  70.0
Score:  73.0
Score:  95.0
Score:  60.0
Score:  74.0
Score:  95.0
Score:  88.0
Score:  95.0
Score:  90.0
Sc

In [51]:
# Average total reward over the episodes collected in `scores` by the evaluation cell.
print(np.mean(scores))

85.65


In [56]:
# Shut down the ViZDoom instance currently bound to `game`.
game.close()