# Challenge 1
Fly as high as you can!

## Objective
Build a network that can launch a satellite rocket and fly it to the highest altitude possible. You can use the notebook below to get started. See if you can modify the training to include staging the rocket so it will go even higher.

## Rules
* You must use the provided ship
* Don't escape Kerbin orbit

Good luck!

In [1]:
# Import Required Libs
import math
import random
import sys
import time
from collections import deque
from os.path import dirname
from random import randrange

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Import KSP Client Engine
sys.path.append(dirname('../../lib'))
sys.path.append(dirname('../../agents'))
from lib.DeepEngine.KSPDeepEngine import *
from agents.BasicDQN import *

### Helper Functions
These functions help us parse and process the game state.

In [2]:
# Vessel telemetry fields that are copied into the state vector, mapped to the
# encoding parseVessel applies to each one ('int', 'float' or 'vector').
INPUTFILTER = {
    'currentStage': 'int',
    'velocityD': 'vector',
    'upAxis': 'vector',
    'up': 'vector',
    'north': 'vector',
    'east': 'vector',
}

class Challenge1:
    """Environment glue for Challenge 1 (fly the provided ship as high as possible).

    Converts the vessel telemetry returned by the KSP engine into a state
    vector, an altitude-based reward and an end-of-episode flag, and builds the
    discrete set of control actions the agent chooses from.
    """

    # Altitude (metres) the vessel reports at the start of every episode.
    START_ALTITUDE = 89
    # The episode ends once the vessel is at or below this altitude (metres).
    MIN_ALTITUDE = 75
    # Altitude change (metres) since the last checkpoint that earns +1 / -1.
    REWARD_ALTITUDE_STEP = 10
    # Seconds without a whole-metre altitude change before the episode is ended.
    STALL_TIMEOUT = 10

    def __init__(self):
        """Open the engine connection and reset the altitude bookkeeping."""
        self.lastCheckAlt = self.START_ALTITUDE
        self.lastAltitude = self.START_ALTITUDE
        self.game = KSPDeepEngine()
        # Filled in by create_environment(); select_random_action() draws from them.
        self.possible_actions = []
        self.action_map = []

    def parseVector3(self, vector3):
        """Return the unit vector of ``vector3`` as ``[x, y, z]``.

        ``vector3`` is a mapping with ``'x'``, ``'y'`` and ``'z'`` entries. A
        zero-length vector is returned as ``[0, 0, 0]`` instead of dividing by
        zero.
        """
        x, y, z = vector3['x'], vector3['y'], vector3['z']
        mag = math.sqrt(x * x + y * y + z * z)
        if mag == 0:
            return [0, 0, 0]
        return [x / mag, y / mag, z / mag]

    def parseVessel(self, vessel):
        """Flatten the INPUTFILTER-selected fields of ``vessel`` into a list.

        Fields are visited in the iteration order of ``vessel``. ``'int'``
        fields are appended unchanged, ``'float'`` fields are squashed with
        ``atan`` and ``'vector'`` fields contribute their three normalised
        components.
        """
        values = []
        for key in vessel:
            kind = INPUTFILTER.get(key)
            if kind == 'int':
                values.append(vessel[key])
            elif kind == 'float':
                values.append(math.atan(vessel[key]))
            elif kind == 'vector':
                values.extend(self.parseVector3(vessel[key]))
        return values

    def parseState(self, action, state, game):
        """Turn one engine response into ``(state, reward, done)``.

        Args:
            action: despite its name, the response object returned by
                ``game.get_state()``; its ``vessel`` mapping and ``action``
                code are read here.
            state: list that is extended in place with the parsed vessel
                values before being converted to an array.
            game: engine object whose ``starttime`` attribute tracks the last
                time the whole-metre altitude changed.

        Returns:
            A float array of the state, the reward (-1, 0 or 1) and a flag
            telling whether the episode is over.
        """
        done = False
        vessel = action.vessel
        state.extend(self.parseVessel(vessel))
        reward = 0

        altitude = vessel['altitude']
        whole_altitude = int(altitude)

        # +1 / -1 each time the vessel moves more than one reward step away
        # from the last checkpoint; the checkpoint then moves with it.
        if altitude - self.lastCheckAlt > self.REWARD_ALTITUDE_STEP:
            reward = 1
            self.lastCheckAlt = altitude
        elif self.lastCheckAlt - altitude > self.REWARD_ALTITUDE_STEP:
            reward = -1
            self.lastCheckAlt = altitude

        # Action code 3 means the ship crashed; falling to MIN_ALTITUDE also ends the episode.
        if action.action == 3 or whole_altitude <= self.MIN_ALTITUDE:
            reward = -1
            done = True

        # Any whole-metre altitude change counts as movement and restarts the stall timer.
        if whole_altitude != self.lastAltitude:
            game.starttime = time.time()

        self.lastAltitude = whole_altitude

        # No movement for STALL_TIMEOUT seconds ends the episode.
        if (time.time() - game.starttime) > self.STALL_TIMEOUT:
            reward = -1
            done = True

        return np.array(state, dtype='f'), reward, done

    @staticmethod
    def _control_action(axis=None, value=0.0):
        """Return a KSPAction, optionally with one flight-control axis preset."""
        action = KSPAction()
        if axis is not None:
            setattr(action.flightCtrlState, axis, value)
        return action

    def create_environment(self):
        """Build the action set and probe the engine for the state shape.

        Reuses the engine opened in ``__init__`` so that the object returned
        here is the same one ``new_episode`` drives.

        Returns:
            ``(game, possible_actions, action_map, state_shape)`` where
            ``action_map[i]`` names the control that ``possible_actions[i]``
            changes.
        """
        game = self.game
        make = self._control_action

        throttle_100 = make('mainThrottle', 1.0)
        throttle_0 = make('mainThrottle', 0.0)
        noop = make()
        roll_left = make('roll', -1.0)
        roll_right = make('roll', 1.0)
        yaw_left = make('yaw', -1.0)
        yaw_right = make('yaw', 1.0)
        pitch_left = make('pitch', -1.0)
        pitch_right = make('pitch', 1.0)

        # Staging is not offered to the agent. To enable it, append a KSPAction
        # to possible_actions and the key 'stage' to action_map; set_action()
        # then sets action = 1 on the command it sends.
        action_map = [
            'mainThrottle', 'mainThrottle', 'noop', 'roll', 'roll', 'yaw', 'yaw', 'pitch', 'pitch'
        ]

        possible_actions = [
            throttle_100, throttle_0, noop, roll_left, roll_right, yaw_left, yaw_right,
            pitch_left, pitch_right
        ]

        self.possible_actions = possible_actions
        self.action_map = action_map

        vessel, state = game.get_state()
        state, reward, done = self.parseState(vessel, state, game)
        print('connected to KSP')

        return game, possible_actions, action_map, state.shape

    def set_action(self, vessel, action, action_map):
        """Build the command to send for ``action``, keeping throttle sticky.

        Args:
            vessel: latest engine response; its ``flightCtrlState.mainThrottle``
                is carried over for every non-throttle action.
            action: prototype action taken from ``possible_actions``.
            action_map: key naming the control that ``action`` changes.

        Returns:
            A new ``KSPAction``.

        NOTE: the command shares (does not copy) the prototype's
        ``flightCtrlState``, so the carried-over throttle is also written onto
        the prototype.
        """
        command = KSPAction()
        command.flightCtrlState = action.flightCtrlState

        # Throttle actions already carry their own throttle value; everything
        # else keeps whatever throttle the vessel currently has.
        if action_map != 'mainThrottle':
            command.flightCtrlState.mainThrottle = vessel.flightCtrlState.mainThrottle

        if action_map == 'stage':
            command.action = 1

        return command

    def select_random_action(self):
        """Pick a uniformly random action.

        Returns:
            ``(action, action_key, index)`` taken from the lists built by
            ``create_environment``.

        Raises:
            RuntimeError: if ``create_environment`` has not been called yet.
        """
        if not self.possible_actions:
            raise RuntimeError('create_environment() must be called before selecting an action')
        index = randrange(len(self.possible_actions))
        return self.possible_actions[index], self.action_map[index], index

    def new_episode(self):
        """Restart the episode, stage twice and reset all bookkeeping."""
        self.game.new_episode()
        self.lastCheckAlt = self.START_ALTITUDE
        self.lastAltitude = self.START_ALTITUDE

        # Two staging commands, each sent after a 1.5 s pause.
        for _ in range(2):
            time.sleep(1.5)
            staging = KSPAction()
            staging.action = KSPAction.STAGING
            self.game.get_state(staging)

        # Restart the stall timer that parseState() reads (``starttime``) so the
        # pauses above do not count as standing still. ``start`` is still set
        # for backward compatibility with the previous attribute name.
        now = time.time()
        self.game.starttime = now
        self.game.start = now
        
        

## create_environment
This sets up the possible actions you want your network to control. Currently the actions are roll, pitch, and yaw left and right, throttle to 0% or 100%, and a noop action.

### Input Filter
The input filter selects which values are added to the state array. A full list can be found in the docs or in the KSPDeepEngine.py file.


```python
INPUTFILTER = {}  
INPUTFILTER['currentStage'] = 'int'
INPUTFILTER['altitude'] = 'float'
INPUTFILTER['velocityD'] = 'vector'
```

### noop
The noop action does not affect the throttle setting. Throttle is a sticky action: once it is set, it keeps that value until you set it again. Other control actions always reset to 0. Of course you can modify this if you wish.

In [3]:
# Build the challenge wrapper, then ask it for the engine handle, the list of
# actions with their matching action keys, and the shape of one parsed state.
challenge1 = Challenge1()
game, possible_actions, action_map, shape = challenge1.create_environment()

connected to KSP


## Set Global Var
Here we will set all the common variables that we can adjust.

In [5]:
### Stack Frame Size
stack_size = 1 # How many frames to feed into the network

### MODEL HYPERPARAMETERS
state_size = [*shape, stack_size]      # Network input shape: the parsed state shape followed by the number of stacked frames
action_size = len(possible_actions) # One entry per action in possible_actions
action_space = np.identity(action_size) # Row i is the one-hot encoding stored in memory for action i

learning_rate =  0.0001     # Alpha (aka learning rate)

### TRAINING HYPERPARAMETERS
total_episodes = 200        # Total episodes for training
max_steps = 10000              # Max possible steps in an episode (NOTE(review): the training loop runs until done and does not enforce this)
batch_size = 64
action_delay = 0.5          # Seconds to wait after sending an action before reading the next state

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.10            # minimum exploration probability 
decay_rate = 0.001            # exponential decay rate for exploration prob

# Q learning hyperparameters
gamma = 0.98               # Discounting rate

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 1000000         # Number of experiences the Memory can keep

### Set to True to run the training cell; leave False to skip training
### (the load-and-test cell restores the checkpoint at model_path either way)
training = False
train_number = 2 # Change this for each training; it names the TensorBoard log directory

model_path = './modules/model.ckpt'

# Set to True to fly one random-action episode via test_environment()
testing = False


In [5]:
def test_environment():
    """Fly random-action episodes and print each episode's total reward.

    A smoke test for the environment wiring: no network is involved, actions
    are drawn uniformly from ``possible_actions`` every 0.2 s until the
    episode ends.
    """
    episodes = 1

    for i in range(episodes):
        # Use the challenge wrapper (as every other cell does) so the rocket is
        # staged and the altitude bookkeeping behind the reward is reset.
        challenge1.new_episode()
        index = randrange(len(possible_actions))
        action = possible_actions[index]

        vessel, state = game.get_state(action)
        state, reward, done = challenge1.parseState(vessel, state, game)
        total_reward = reward

        while not done:
            time.sleep(0.2)
            index = randrange(len(possible_actions))
            action = possible_actions[index]
            action_key = action_map[index]

            action = challenge1.set_action(vessel, action, action_key)
            vessel, state = game.get_state(action)
            state, reward, done = challenge1.parseState(vessel, state, game)

            # Accumulate: the printed figure is the reward of the whole episode,
            # not just of its last step.
            total_reward += reward
        print ("Episode: ", str(i), "Reward:", total_reward)
        time.sleep(2)

if testing:
    test_environment()


In [6]:
def preprocess_frame(frame):
    """Hook for per-frame preprocessing; currently hands the frame back untouched."""
    processed = frame  # add any preprocessing of the frame here
    return processed

In [7]:
# Rolling window of the most recent `stack_size` frames, zero-filled until the
# first episode starts. (`np.zeros(shape, dtype=int)` replaces the former
# `np.zeros((*shape), dtype=np.int)`, which current Python and NumPy reject.)
stacked_frames = deque([np.zeros(shape, dtype=int) for _ in range(stack_size)], maxlen=stack_size)


def stack_frames(stacked_frames, state, is_new_episode):
    """Push ``state`` into the frame window and return the stacked network input.

    Args:
        stacked_frames: deque holding the most recent frames.
        state: newly parsed state; it is passed through ``preprocess_frame``.
        is_new_episode: when true the window is restarted and filled with
            ``stack_size`` copies of the new frame; otherwise the frame is
            appended and the oldest one drops out.

    Returns:
        ``(stacked_state, stacked_frames)`` where ``stacked_state`` stacks the
        frames of the window along axis 1.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # A fresh deque, so the caller's previous window is left untouched.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # The deque's maxlen discards the oldest frame automatically.
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=1)
    return stacked_state, stacked_frames

In [8]:
class Memory():
    """Bounded replay buffer: once full, the oldest experience is dropped on every add."""

    def __init__(self, max_size):
        """Create an empty buffer that keeps at most ``max_size`` experiences."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Store one experience at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random."""
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)

        picked = []
        for position in chosen:
            picked.append(self.buffer[position])
        return picked

## Pretrain Network Memory
Here we send random actions to the rocket and save the resulting experiences to memory.


In [9]:
# Fill the replay memory with `pretrain_length` experiences gathered by flying
# random actions, so the first training batch can be sampled.

# Instantiate memory
memory = Memory(max_size = memory_size)

# Start New Episode
challenge1.new_episode()
vessel, state = game.get_state()
state, reward, done = challenge1.parseState(vessel, state, game)
state, stacked_frames = stack_frames(stacked_frames, state, True)

print('Start Pretrain')

for i in range(pretrain_length):
    # Select a random action and send it (throttle is kept sticky by set_action)
    action, action_key, index = challenge1.select_random_action()
    action = challenge1.set_action(vessel, action, action_key)

    vessel, state = game.get_state(action)
    state, reward, done = challenge1.parseState(vessel, state, game)
    state, stacked_frames = stack_frames(stacked_frames, state, False)

    # Give the action time to take effect before observing the outcome
    time.sleep(action_delay)

    if done:
        # The episode ended: there is no real next state, so store an all-zero one.
        # (`np.zeros(shape, dtype=int)` replaces `np.zeros((*shape), dtype=np.int)`,
        # which current Python and NumPy reject.)
        next_state = np.zeros(shape, dtype=int)
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

        # Add experience to memory
        memory.add((state, action_space[index], reward, next_state, done))

        # Start a new episode
        challenge1.new_episode()

        vessel, state = game.get_state()
        state, reward, done = challenge1.parseState(vessel, state, game)
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    else:
        # Observe the state the action led to; its reward and done flag are the ones stored
        vessel, next_state = game.get_state()
        next_state, reward, done = challenge1.parseState(vessel, next_state, game)
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

        # Add experience to memory
        memory.add((state, action_space[index], reward, next_state, done))

        # Our state is now the next_state
        state = next_state
print('End Pretrain')

Start Pretrain
End Pretrain


In [9]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the BasicDQN
# NOTE(review): this rebinds the name BasicDQN from the class (presumably
# brought in by the agents.BasicDQN star import) to the instance, so running
# this cell a second time calls the instance instead of the class. Re-run the
# import cell first if the network has to be rebuilt. The later cells read
# .inputs_, .output, .loss, .optimizer, .target_Q and .actions_ from this instance.
BasicDQN = BasicDQN(state_size, action_size, learning_rate)



Instructions for updating:
Use keras.layers.flatten instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [10]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose the next action with an epsilon-greedy policy.

    With probability epsilon a random action a_t is selected; otherwise the
    greedy action a_t = argmax_a Q(s_t, a) is taken from the Q-network.

    Args:
        explore_start: exploration probability at ``decay_step`` 0.
        explore_stop: lower bound the exploration probability decays towards.
        decay_rate: exponential decay rate of the exploration probability.
        decay_step: number of decisions taken so far.
        state: current stacked state; a batch axis is prepended before it is
            fed to the network.
        actions: currently unused; greedy actions are looked up in the
            module-level ``possible_actions``.

    Returns:
        ``(action, explore_probability, index)`` where ``index`` is the
        position of ``action`` in ``possible_actions``.
    """
    # NOTE(review): because of the 0.1 offset the random branch is taken with
    # probability explore_probability + 0.1 (capped at 1), i.e. more often than
    # the returned explore_probability says. Confirm this is intended.
    exp_exp_tradeoff = np.random.rand() - 0.1

    # Exponentially decayed epsilon
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    # A random action is always drawn; it is replaced below when exploiting.
    action, _, index = challenge1.select_random_action()

    if explore_probability <= exp_exp_tradeoff:
        # Exploitation: estimate the Q values of this state and take the best action
        Qs = sess.run(BasicDQN.output, feed_dict = {BasicDQN.inputs_: state.reshape((1, *state.shape))})
        index = int(np.argmax(Qs))
        action = possible_actions[index]

    return action, explore_probability, index

In [12]:
# Deep Q-learning training loop: fly episodes with the epsilon-greedy policy,
# store every transition in the replay memory and fit the network on a random
# mini-batch after each step.
if training:
    print('Start Training')
    # Setup TensorBoard Writer
    writer = tf.summary.FileWriter("tensorboard/challenge1/" + str(train_number))

    tf.summary.scalar("Loss", BasicDQN.loss)
    write_op = tf.summary.merge_all()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())
        # Counts every decision taken; drives the epsilon decay
        decay_step = 0
        # Defined up front because an episode can end (and print its summary)
        # before the first optimisation step has produced a loss.
        loss = float('nan')

        for episode in range(total_episodes):
            # Set step to 0
            step = 0
            episode_max_height = 0

            # Initialize the rewards of the episode
            episode_rewards = []

            # Make a new episode and observe the first state
            challenge1.new_episode()
            vessel, state = game.get_state()
            state, reward, done = challenge1.parseState(vessel, state, game)

            # Remember that stack frame function also call our preprocess function.
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            while not done:
                step += 1

                # Increase decay_step
                decay_step += 1

                # Predict the action to take and take it
                action, explore_probability, index = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)

                # Do the action
                action = challenge1.set_action(vessel, action, action_map[index])
                vessel, state = game.get_state(action)
                state, reward, done = challenge1.parseState(vessel, state, game)
                state, stacked_frames = stack_frames(stacked_frames, state, False)

                # Track the highest altitude reached in this episode
                if vessel.vessel['altitude'] > episode_max_height:
                    episode_max_height = vessel.vessel['altitude']
                time.sleep(action_delay)
                # If the game is finished
                if done:
                    episode_rewards.append(reward)
                    # The episode ends so there is no next state; store an all-zero one.
                    # (`np.zeros(shape, dtype=int)` replaces `np.zeros((*shape), dtype=np.int)`,
                    # which current Python and NumPy reject.)
                    next_state = np.zeros(shape, dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                              'Max Altitude: {:.4f}'.format(episode_max_height),
                              'Training loss: {:.4f}'.format(loss),
                              'Explore P: {:.4f}'.format(explore_probability),
                              'Episode Reward: {:.4f}'.format(total_reward))

                    memory.add((state, action_space[index], reward, next_state, done))
                else:
                    # Get the next state
                    vessel, next_state = game.get_state()
                    next_state, reward, _ = challenge1.parseState(vessel, next_state, game)
                    episode_rewards.append(reward)

                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Add experience to memory
                    memory.add((preprocess_frame(state), action_space[index], reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state

                ### LEARNING PART
                # Obtain random mini-batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                # Get Q values for next_state
                Qs_next_state = sess.run(BasicDQN.output, feed_dict = {BasicDQN.inputs_: next_states_mb})

                # Q_target = r if the episode ends at s+1, otherwise r + gamma * max_a' Q(s', a')
                targets_mb = np.array([
                    sample_reward if terminal else sample_reward + gamma * np.max(next_qs)
                    for sample_reward, terminal, next_qs in zip(rewards_mb, dones_mb, Qs_next_state)
                ])

                loss, _ = sess.run([BasicDQN.loss, BasicDQN.optimizer],
                                    feed_dict={BasicDQN.inputs_: states_mb,
                                               BasicDQN.target_Q: targets_mb,
                                               BasicDQN.actions_: actions_mb})

                # Write TF Summaries
                summary = sess.run(write_op, feed_dict={BasicDQN.inputs_: states_mb,
                                                   BasicDQN.target_Q: targets_mb,
                                                   BasicDQN.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, model_path)
                print("Model Saved")

Start Training
Episode: 0 Max Altitude: 112.5891 Training loss: 1.0403 Explore P: 0.9769 Episode Reward: -1.0000
Model Saved
Episode: 1 Max Altitude: 836.9823 Training loss: 0.5363 Explore P: 0.9258 Episode Reward: 11.0000
Episode: 2 Max Altitude: 212.5379 Training loss: 0.6961 Explore P: 0.8958 Episode Reward: 1.0000
Episode: 3 Max Altitude: 796.2223 Training loss: 0.7793 Explore P: 0.8495 Episode Reward: 11.0000
Episode: 4 Max Altitude: 373.7213 Training loss: 0.9906 Explore P: 0.8165 Episode Reward: 6.0000
Episode: 5 Max Altitude: 3417.7838 Training loss: 2.7852 Explore P: 0.7317 Episode Reward: 20.0000
Model Saved
Episode: 6 Max Altitude: 89.4751 Training loss: 1.4646 Explore P: 0.7173 Episode Reward: -2.0000
Episode: 7 Max Altitude: 89.4599 Training loss: 2.0635 Explore P: 0.7045 Episode Reward: -2.0000
Episode: 8 Max Altitude: 509.9095 Training loss: 4.4853 Explore P: 0.6785 Episode Reward: 7.0000
Episode: 9 Max Altitude: 299.8803 Training loss: 0.7453 Explore P: 0.6575 Episode R

In [13]:

# Restore the saved network and fly one episode, always taking the greedy action.
print('Load and test model')
saver = tf.train.Saver()
with tf.Session() as sess:
    # Load the model
    saver.restore(sess, model_path)
    for i in range(1):
        episode_rewards = []

        challenge1.new_episode()

        vessel, state = game.get_state()
        state, reward, done = challenge1.parseState(vessel, state, game)
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        while not done:
            # Take the biggest Q value (= the best action)
            Qs = sess.run(BasicDQN.output, feed_dict = {BasicDQN.inputs_: state.reshape((1, *state.shape))})
            choice = int(np.argmax(Qs))

            # Send the action through set_action, exactly as the training loop
            # does, so non-throttle actions keep the vessel's current throttle.
            action = challenge1.set_action(vessel, possible_actions[choice], action_map[choice])
            vessel, state = game.get_state(action)
            state, reward, done = challenge1.parseState(vessel, state, game)

            time.sleep(action_delay)
            if done:
                break

            # Observe the outcome; its reward is the one that counts towards the score
            vessel, next_state = game.get_state()
            next_state, reward, done = challenge1.parseState(vessel, next_state, game)
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            episode_rewards.append(reward)
            state = next_state

        score = np.sum(episode_rewards)
        print("Score: ", score)


Load and test model
INFO:tensorflow:Restoring parameters from ./modules/model.ckpt
Score:  132
