In [1]:
import tensorflow as tf      
import numpy as np
import random                # Handling random number generation
from random import randrange
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

from collections import deque# Ordered collection with ends
import matplotlib.pyplot as plt # Display graphs

from agents.KSPDeepEngine import *




In [2]:
def create_environment():
    """Create the KSP game engine and the discrete action set offered to the agent.

    Returns:
        tuple: ``(game, possible_actions, action_map)`` where ``game`` is a new
        ``KSPDeepEngine``, ``possible_actions`` is a list of ``KSPAction``
        objects, and ``action_map[i]`` is the key (``'noop'``, ``'roll'``,
        ``'yaw'`` or ``'pitch'``) describing which control
        ``possible_actions[i]`` drives. ``set_action`` dispatches on that key.
    """
    def _axis_action(axis, value):
        # One KSPAction with at most a single control axis set; axis=None
        # leaves the freshly constructed action untouched (the no-op).
        action = KSPAction()
        if axis is not None:
            setattr(action.flightCtrlState, axis, value)
        return action

    # Single source of truth: (key, axis, value). Building both returned lists
    # from the same rows keeps action_map[i] in step with possible_actions[i].
    # Throttle, staging and axis-neutral actions are intentionally not exposed.
    action_table = [
        ('noop', None, 0.0),
        ('roll', 'roll', -1.0),
        ('roll', 'roll', 1.0),
        ('yaw', 'yaw', -1.0),
        ('yaw', 'yaw', 1.0),
        ('pitch', 'pitch', -1.0),
        ('pitch', 'pitch', 1.0),
    ]

    game = KSPDeepEngine()
    possible_actions = [_axis_action(axis, value) for _, axis, value in action_table]
    action_map = [key for key, _, _ in action_table]

    return game, possible_actions, action_map

def set_action(flightCtrl, action, action_map):
    """Fold one discrete action into the live flight controls.

    Args:
        flightCtrl: current control-state object. For the keys ``'roll'``,
            ``'pitch'`` and ``'yaw'`` the matching attribute is overwritten
            in place.
        action: action whose ``flightCtrlState`` supplies the new axis value.
        action_map: despite the name, the single key string for this action
            (``'noop'``, ``'roll'``, ``'pitch'``, ``'yaw'``, ``'stage'``).

    Returns:
        A new ``KSPAction``. For every key except ``'noop'`` its
        ``flightCtrlState`` is the (possibly updated) ``flightCtrl`` object;
        for ``'noop'`` it keeps whatever ``KSPAction()`` constructs. For
        ``'stage'`` its ``action`` field is set to 1.
    """
    command = KSPAction()

    # Copy the selected axis from the discrete action onto the live controls.
    for axis in ('roll', 'pitch', 'yaw'):
        if action_map == axis:
            setattr(flightCtrl, axis, getattr(action.flightCtrlState, axis))

    # Throttle handling is disabled:
    #     if action_map == 'mainThrottle':
    #         flightCtrl.mainThrottle = action.flightCtrlState.mainThrottle

    if action_map == 'stage':
        command.action = 1

    # Everything but the no-op carries the live controls forward.
    if not (action_map == 'noop'):
        command.flightCtrlState = flightCtrl

    return command

In [3]:
def test_environment(episodes=10):
    """Smoke-test the environment by flying random actions.

    Builds a fresh environment, then for each episode keeps sending uniformly
    random actions until the game reports ``done``. Prints the state shape on
    every step and the accumulated reward at the end of each episode.

    Args:
        episodes: number of episodes to run (defaults to 10).
    """
    game, possible_actions, action_map = create_environment()
    for episode in range(episodes):
        game.new_episode()
        total_reward = 0

        # The opening observation is requested with a raw random action; only
        # after it do we have a flightctrl object to merge actions into.
        action = possible_actions[randrange(len(possible_actions))]
        state, reward, done, vessel, flightctrl = game.get_state(action)

        while not done:
            time.sleep(0.02)
            index = randrange(len(possible_actions))
            action = set_action(flightctrl, possible_actions[index], action_map[index])
            state, reward, done, vessel, flightctrl = game.get_state(action)
            print(state.shape)

            # Accumulate rather than overwrite, so the printed figure is the
            # episode total. NOTE(review): assumes get_state returns a
            # per-step reward, as the training loop's reward list suggests.
            total_reward += reward
        print ("Episode: ", str(episode), total_reward)
        time.sleep(2)
    
#test_environment()



    

In [4]:
def preprocess_frame(frame):
    """Return ``frame`` unchanged.

    Kept as a hook so every caller funnels observations through one place.
    The greyscale / crop / normalise / resize steps of the image-based
    version this was adapted from are all disabled.
    """
    # Disabled image steps, kept for reference:
    #   cropped_frame = frame[30:-10, 30:-30]
    #   normalized_frame = cropped_frame / 255.0
    #   preprocessed_frame = transform.resize(frame, [84, 84])
    untouched = frame
    return untouched

In [5]:
# Number of consecutive observations stacked into one network input.
# With 1 the agent only ever sees the latest observation.
stack_size = 1

# Rolling window of the most recent observations, zero-filled (length-21
# vectors) until real observations are pushed in.
# The builtin int replaces np.int, a deprecated alias of it that NumPy 1.24 removed.
stacked_frames = deque((np.zeros(21, dtype=int) for _ in range(stack_size)), maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Push an observation onto the frame window and return the stacked state.

    Args:
        stacked_frames: deque holding the most recent frames. Ignored and
            replaced when ``is_new_episode`` is true; otherwise appended to
            in place (the deque's ``maxlen`` drops the oldest frame).
        state: raw observation; passed through ``preprocess_frame`` first.
        is_new_episode: when true, restart the window filled with
            ``stack_size`` copies of this frame.

    Returns:
        tuple: ``(stacked_state, stacked_frames)`` - the window's frames
        joined by ``np.stack`` along axis 1, and the deque to pass to the
        next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # A fresh window made only of the first frame. No zero placeholders
        # (and no hard-coded frame length) are needed: they would be pushed
        # out again before the window is ever stacked.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)

    stacked_state = np.stack(list(stacked_frames), axis=1)
    return stacked_state, stacked_frames

In [6]:
game, possible_actions, action_map = create_environment()

# --- Model -----------------------------------------------------------------
# Network input shape: a 21-value observation by 1 stacked frame.
state_size = [21, 1]
# One network output per entry in possible_actions.
action_size = len(possible_actions)
# Row i is the one-hot encoding of action i.
action_space = np.eye(action_size)

# Optimiser step size (alpha).
learning_rate = 1e-4

# --- Training --------------------------------------------------------------
total_episodes = 100   # episodes to train for
max_steps = 10000      # upper bound on steps within one episode
batch_size = 64        # experiences per gradient step
action_delay = 0.5     # seconds to wait after sending an action

# --- Epsilon-greedy exploration ---------------------------------------------
explore_start = 1.0    # exploration probability at the first step
explore_stop = 0.10    # floor the exploration probability decays towards
decay_rate = 1e-3      # exponential decay rate of the exploration probability

# --- Q-learning -------------------------------------------------------------
gamma = 0.99           # discount factor

# --- Replay memory ------------------------------------------------------------
pretrain_length = batch_size   # experiences collected before training starts
memory_size = 10 ** 6          # capacity of the replay memory

# Set to False to only watch the trained agent.
training = True

# Set to True to render the environment.
episode_render = False



In [7]:
class DQNetwork:
    """Fully connected Q-network over a flattened observation (TF 1.x graph mode).

    Graph: inputs -> flatten -> batch norm -> dense(256, ELU) -> dense(action_size).
    The convolutional stack of the image-based network this was adapted from
    is not used.

    Attributes:
        inputs_: float32 placeholder of shape ``[None, *state_size]``.
        actions_: float32 placeholder ``[None, action_size]``; multiplied with
            ``output`` to pick the Q-value of the action taken (callers feed
            one-hot rows).
        target_Q: float32 placeholder ``[None]`` holding the regression target.
        is_training: scalar bool placeholder, defaults to True. Switches batch
            normalisation between batch statistics (True) and moving averages
            (False). Feed False when scoring a single state: in training mode
            a batch of one is normalised to all zeros, so the Q-values would
            not depend on the state at all.
        output: Q-value for every action, shape ``[None, action_size]``.
        Q: Q-value of the selected action.
        loss: mean squared error between ``target_Q`` and ``Q``.
        optimizer: RMSProp training op; also refreshes the batch-norm moving
            averages.
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # *state_size unpacks the per-sample shape after the batch axis.
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions_")

            # target_Q is R(s,a) + gamma * max_a' Q(s', a')
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # Defaults to True so callers that do not feed it behave exactly
            # as when batch norm was hard-wired to training mode.
            self.is_training = tf.placeholder_with_default(True, shape=(), name="is_training")

            self.flatten = tf.layers.flatten(self.inputs_)
            self.norm = tf.layers.batch_normalization(self.flatten,
                                                      training=self.is_training,
                                                      epsilon=1e-5,
                                                      name='batch_norm')

            self.fc = tf.layers.dense(inputs=self.norm,
                                      units=256,
                                      activation=tf.nn.elu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      name="fc1")

            self.output = tf.layers.dense(inputs=self.fc,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units=action_size,
                                          activation=None)

            # Q is our predicted Q value for the action that was taken.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)

            # Mean over the batch of (Qtarget - Q)^2
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            # tf.layers.batch_normalization registers its moving-average
            # updates in UPDATE_OPS and they only run if the train op depends
            # on them. Restrict to this network's own name scope so another
            # network in the same graph is not dragged in.
            scope_prefix = tf.get_default_graph().get_name_scope() + '/'
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=scope_prefix)
            with tf.control_dependencies(update_ops):
                self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

In [8]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork.
# The instance is bound to the same name as the class, so after the first run
# `DQNetwork` is no longer callable. Recover the class from the instance so
# this cell can be re-run without re-running the class definition first.
_dqn_cls = DQNetwork if isinstance(DQNetwork, type) else type(DQNetwork)
DQNetwork = _dqn_cls(state_size, action_size, learning_rate)

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.batch_normalization` documentation).
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [9]:
class Memory():
    """Bounded FIFO replay buffer with uniform sampling without replacement."""

    def __init__(self, max_size):
        # A deque with maxlen silently drops the oldest entry once full.
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience, evicting the oldest if the buffer is full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences as a list.

        Raises ``ValueError`` (from ``np.random.choice``) when ``batch_size``
        exceeds the number of stored experiences.
        """
        # An int population is sampled as if it were np.arange(population).
        picks = np.random.choice(len(self.buffer), size=batch_size, replace=False)

        chosen = []
        for position in picks:
            chosen.append(self.buffer[position])
        return chosen

In [10]:
game, possible_actions, action_map = create_environment()

# Replay memory that the random pre-training below fills.
memory = Memory(max_size = memory_size)

# Start an episode and take the first observation.
game.new_episode()
state, reward, done, vessel, flightctrl = game.get_state()
state, stacked_frames = stack_frames(stacked_frames, state, True)

print('Start Pretrain')

# Loop-invariant: the action set does not change while pre-training.
action_range = len(possible_actions)

for i in range(pretrain_length):
    # Pick a uniformly random action and merge it into the live controls.
    index = randrange(action_range)
    action = set_action(flightctrl, possible_actions[index], action_map[index])

    # Send the action; this observation becomes the transition's `state`.
    state, reward, done, vessel, flightctrl = game.get_state(action)
    state, stacked_frames = stack_frames(stacked_frames, state, False)

    # Give the action time to take effect before observing the outcome.
    time.sleep(action_delay)

    if done:
        # Episode over: the successor is an all-zero observation.
        # (builtin int: np.int was a deprecated alias removed in NumPy 1.24)
        next_state = np.zeros((21), dtype=int)
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

        # Add experience to memory
        memory.add((state, action_space[index], reward, next_state, done))

        # Start a new episode and restart the frame window.
        game.new_episode()
        state, reward, done, vessel, flightctrl = game.get_state()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    else:
        # Observe again (re-sending the same action) to get the next state.
        next_state, reward, done, vessel, flightctrl = game.get_state(action)
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

        # Add experience to memory
        memory.add((state, action_space[index], reward, next_state, done))

        # Our state is now the next_state
        state = next_state


Start Pretrain


In [11]:
# TensorBoard event files for this run are written under this directory.
writer = tf.summary.FileWriter(logdir="tensorboard/dqn/38")

# Track the training loss as a scalar summary.
tf.summary.scalar(name="Loss", tensor=DQNetwork.loss)

# Single op that evaluates every summary registered so far.
write_op = tf.summary.merge_all()

In [12]:


def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions, step):
    """Choose an action with an epsilon-greedy policy.

    With probability epsilon a random action a_t is selected; otherwise
    a_t = argmax_a Q(s_t, a) according to the Q-network.

    Epsilon decays exponentially from ``explore_start`` towards
    ``explore_stop``::

        epsilon = explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)

    Args:
        explore_start: exploration probability at ``decay_step`` 0.
        explore_stop: floor the exploration probability decays towards.
        decay_rate: exponential decay rate.
        decay_step: number of decisions taken so far.
        state: current stacked state; only read when exploiting, where it is
            reshaped to a batch of one and fed to the network.
        actions: list of candidate actions to choose from.
        step: unused; kept so existing callers keep working.

    Returns:
        tuple: ``(action, explore_probability, index)`` where ``action`` is
        ``actions[index]``.

    Note:
        The greedy branch reads the module-level ``sess`` and ``DQNetwork``,
        so it must be called while that session is open.
    """
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    # NOTE(review): the 0.1 offset makes the effective chance of a random
    # action min(1, explore_probability + 0.1), higher than the value
    # returned and logged. Kept as is; confirm it is intentional.
    exp_exp_tradeoff = np.random.rand() - 0.1

    if explore_probability > exp_exp_tradeoff:
        # Exploration: uniformly random action from the supplied list.
        index = randrange(len(actions))
    else:
        # Exploitation: the action with the largest predicted Q-value.
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        index = int(np.argmax(Qs))

    return actions[index], explore_probability, index



In [13]:
game, possible_actions, action_map = create_environment()

# Saver will help us to save our model
saver = tf.train.Saver()


if training:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Global decision counter; drives the epsilon decay and is the
        # TensorBoard step.
        decay_step = 0

        # Defined up front so the end-of-episode report cannot hit an unbound
        # name when an episode ends before the first gradient step.
        loss = float('nan')

        for episode in range(total_episodes):
            step = 0
            episode_max_height = 0

            # Rewards collected during this episode
            episode_rewards = []

            # Make a new episode and observe the first state
            game.new_episode()
            state, reward, done, vessel, flightctrl = game.get_state()

            # stack_frames also runs preprocess_frame on the observation.
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            # max_steps bounds the episode length, as the hyperparameter says.
            while not done and step < max_steps:
                step += 1
                decay_step += 1

                # Predict the action to take
                action, explore_probability, index = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions, game.step)

                # Merge it into the live controls and send it
                action = set_action(flightctrl, action, action_map[index])
                state, reward, done, vessel, flightctrl = game.get_state(action)
                state, stacked_frames = stack_frames(stacked_frames, state, False)
                if vessel['altitude'] > episode_max_height:
                    episode_max_height = vessel['altitude']

                # Give the action time to take effect before observing again
                time.sleep(action_delay)

                if done:
                    # The episode ended, so the successor is an all-zero state.
                    # (builtin int: np.int was removed in NumPy 1.24)
                    next_state = np.zeros((21), dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Record the final reward before totalling the episode.
                    episode_rewards.append(reward)
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                              'Max Altitude: {:.4f}'.format(episode_max_height),
                              'Training loss: {:.4f}'.format(loss),
                              'Explore P: {:.4f}'.format(explore_probability),
                              'Action: {}'.format(action_map[index]))

                    memory.add((state, action_space[index], reward, next_state, done))
                else:
                    # Observe again (re-sending the same action) for the next state
                    next_state, reward, _, _, _ = game.get_state(action)
                    episode_rewards.append(reward)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Add experience to memory
                    memory.add((state, action_space[index], reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state

                ### LEARNING PART
                # Obtain random mini-batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                # Get Q values for next_state
                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})

                # Q_target = r for terminal transitions,
                # otherwise r + gamma * max_a' Q(s', a')
                targets_mb = np.where(dones_mb,
                                      rewards_mb,
                                      rewards_mb + gamma * np.max(Qs_next_state, axis=1))

                # One run performs the update and produces the summary, which
                # saves a second forward pass over the same batch.
                loss, _, summary = sess.run([DQNetwork.loss, DQNetwork.optimizer, write_op],
                                            feed_dict={DQNetwork.inputs_: states_mb,
                                                       DQNetwork.target_Q: targets_mb,
                                                       DQNetwork.actions_: actions_mb})

                # Log against the global step so each update gets its own
                # point instead of piling up on the episode index.
                writer.add_summary(summary, decay_step)
                writer.flush()

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

Episode: 0 Max Altitude: 140.2988 Training loss: 15.5488 Explore P: 0.9848 Action: pitch
Model Saved
Episode: 1 Max Altitude: 310.0107 Training loss: 16.5953 Explore P: 0.9561 Action: pitch
Episode: 2 Max Altitude: 475.2083 Training loss: 17.0906 Explore P: 0.9184 Action: roll
Episode: 3 Max Altitude: 304.7653 Training loss: 18.4863 Explore P: 0.8903 Action: roll
Episode: 4 Max Altitude: 1416.5353 Training loss: 23.1399 Explore P: 0.8324 Action: yaw
Episode: 5 Max Altitude: 414.1123 Training loss: 18.5656 Explore P: 0.8023 Action: pitch
Model Saved
Episode: 6 Max Altitude: 235.9908 Training loss: 23.1058 Explore P: 0.7822 Action: pitch
Episode: 7 Max Altitude: 4779.7721 Training loss: 115.0331 Explore P: 0.6896 Action: roll
Episode: 8 Max Altitude: 413.8457 Training loss: 74.5002 Explore P: 0.6653 Action: yaw
Episode: 9 Max Altitude: 792.8213 Training loss: 78.5893 Explore P: 0.6340 Action: yaw
Episode: 10 Max Altitude: 151.2394 Training loss: 72.1549 Explore P: 0.6234 Action: yaw
Mode

KeyboardInterrupt: 

In [15]:
with tf.Session() as sess:

    # Load the trained weights
    saver.restore(sess, "./models/model.ckpt")
    for i in range(1):

        game.new_episode()

        state, reward, done, vessel, flightctrl = game.get_state()
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        while not done:
            # NOTE(review): the network applies batch normalisation in training
            # mode, and this feeds a batch of one - verify the Q-values really
            # vary with the state here.
            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})

            # Take the biggest Q value (= the best action)
            choice = int(np.argmax(Qs))

            # Merge the chosen action into the live controls exactly as the
            # training loop does, so the agent is evaluated under the same
            # control dynamics it was trained with.
            action = set_action(flightctrl, possible_actions[choice], action_map[choice])

            state, reward, done, vessel, flightctrl = game.get_state(action)
            time.sleep(action_delay)
            if done:
                break

            # Observe again (re-sending the same action) for the next state
            next_state, _, _, _, _ = game.get_state(action)
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state

        score = game.step
        print("Score: ", score)


INFO:tensorflow:Restoring parameters from ./models/model.ckpt


KeyboardInterrupt: 