In [5]:
import time
import os
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from src.algorithms.PPO import PPO
from src.environments.jsbsim.JSBSimEnv import Env #can be jsbsim.JSBSimEnv or xplane.XPlaneEnv
from src.scenarios.deltaAttitudeControlScene import Scene

In [6]:
# Experiment configuration cell: plotting style, PPO hyperparameters, logging and
# environment settings, lookup tables, and creation of the experiment output folder.
# Every name defined here is consumed by the cells below, so names are kept as-is.

# Setting the matplotlib style with the seaborn module
sns.set_style("darkgrid")

# Hyperparameters of the PPO algorithm
steps_per_epoch = 5000
epochs = 2000
gamma = 0.99
clip_ratio = 0.2
policy_learning_rate = 3e-4
value_function_learning_rate = 1e-3
train_policy_iterations = 80
train_value_iterations = 80
lam = 0.97
target_kl = 0.01
hidden_sizes = (128, 128)
observation_dimensions = 7
num_actions = 4

savePeriod = 25  # every so many epochs the table/model will be saved to a file

# Parameters for logging and actions
pauseDelay = 0.1  # time an action is being applied to the environment
# NOTE(review): `id` shadows the built-in id(); the name is kept because the cells
# below pass it to Env and compare it against "doubleDeep".
id = "doubleDeep"

experimentName = "Experiment"
connectAttempts = 0.0  # counts every time the UDP packages are lost on a single retry

# add notes that will be saved to the setup file to clarify the experiment setup better
notes = "This experiment was run with..."

# Timestamp suffix for the experiment folder; ":" and " " are replaced so the
# ctime string can be used as part of a directory name.
dateTime = time.ctime().replace(":", "-").replace(" ", "_")
experimentName = experimentName + "-" + dateTime

errors = 0.0  # counts every time the UDP packages are lost on all retries

timeStart = time.time()  # used to measure time
timeEnd = time.time()  # used to measure time

# Parameters for training and visualisation
loadModel = False  # will load trained model for tf if True
jsbRender = False  # will send UDP data to flight gear for rendering if True
jsbRealTime = False  # will slow down the physics to portray real time rendering
saveResultsToPlot = True  # Saves results to png in the experiment folder at runtime
usePredefinedSeeds = False  # Sets seeds for tf, np and random for more replicable results
# (not fully replicable due to stochastic environments)

# Parameters for the environment and scenario
startingVelocity = 60
startingPitchRange = 10
startingRollRange = 15
randomDesiredState = True  # Set a new state to stabilize towards every episode
desiredPitchRange = 1
desiredRollRange = 1

# Name -> index lookup tables handed to the scene and the environment.
dictObservation = {
    "lat": 0,
    "long": 1,
    "alt": 2,
    "pitch": 3,
    "roll": 4,
    "yaw": 5,
    "gear": 6}
# NOTE(review): this table lists 7 actions while num_actions above is 4 - confirm
# which of the two the scene/agent actually relies on.
dictAction = {
    "pi+": 0,
    "pi-": 1,
    "ro+": 2,
    "ro-": 3,
    "ru+": 4,
    "ru-": 5,
    "no": 6}
# Last error recorded per phase; the training loop writes a message into "step".
dictErrors = {
    "reset": 0,
    "update": 0,
    "step": 0}
dictRotation = {
    "roll": 0,
    "pitch": 1,
    "yaw": 2,
    "northVelo": 3,
    "eastVelo": 4,
    "verticalVelo": 5}

# Per-epoch training statistics collected for plotting.
movingEpRewards = {
    "epoch": [],
    "mreturn": [],
    "mlength": [],
    "average": [],
    "reward": [],
    "return": [],
    "length": []}

epochRewards = []
movingRate = savePeriod  # gives the number by which the moving average will be done, best if n * savePeriod

# Used in case of connection error to XPlane: a one-element list holding an all-zero state tuple
fallbackState = [tuple([0] * observation_dimensions)]

# -998->NO CHANGE
flightOrigin = [35.126, 126.809, 6000, 0, 0, 0, 1]  # Gwangju SK
flightDestinaion = [33.508, 126.487, 6000, -998, -998, -998, 1]  # Jeju SK
#  Other locations to use: Memmingen: [47.988, 10.240], Chicago: [41.976, -87.902]

stateDepth = 1  # Number of old observations kept for current state. State will consist of s(t) ... s(t_n)


# Create the output folder for this run. exist_ok=True avoids the check-then-create
# race of a separate os.path.exists() test and makes re-running the cell harmless.
os.makedirs("./Experiments/" + experimentName, exist_ok=True)

In [None]:
# Build the three training objects in dependency order: the scene is handed to the
# environment, and the environment is handed to the PPO agent.
# All arguments are positional; their order must match the constructors.
scene = Scene(
    dictObservation,
    dictAction,
    num_actions,
    stateDepth,
    startingVelocity,
    startingPitchRange,
    startingRollRange,
    usePredefinedSeeds,
    randomDesiredState,
    desiredPitchRange,
    desiredRollRange,
)

env = Env(
    scene,
    flightOrigin,
    flightDestinaion,
    num_actions,
    usePredefinedSeeds,
    dictObservation,
    dictAction,
    dictRotation,
    startingVelocity,
    pauseDelay,
    id,
    jsbRender,
    jsbRealTime,
)

P = PPO(
    steps_per_epoch,
    epochs,
    gamma,
    clip_ratio,
    policy_learning_rate,
    value_function_learning_rate,
    train_policy_iterations,
    train_value_iterations,
    lam,
    target_kl,
    hidden_sizes,
    observation_dimensions,
    num_actions,
    env,
    experimentName,
    loadModel,
)

In [None]:
# PPO training loop.
# Each epoch collects `steps_per_epoch` environment transitions into P.buffer, then
# runs the policy update (with KL-based early stopping) and the value-function update,
# logs the epoch statistics and periodically saves a reward plot and the model.
# Iterates epoch = 0 .. epochs inclusive (epochs + 1 iterations).
for epoch in range(epochs + 1):
    timeStart = time.time()  # time this epoch only (rollout + updates)

    # Per-epoch accumulators: episode returns/lengths, episode count, total reward
    sum_return = 0
    sum_length = 0
    num_episodes = 0
    epochReward = 0

    # Iterate over the steps of each epoch
    for t in range(steps_per_epoch):

        # Get the logits and action for the agent's current observation,
        # then take one step in the environment
        observation = P.observation.reshape(1, -1)
        logits, action = P.sample_action(observation)
        observation_new, reward, done, _ = env.step(action[0].numpy())

        # A NaN in the state happens in JSBSim sometimes: record the error, give the
        # step no reward and end the episode so the environment is reset below.
        # This check runs before the reward is accumulated so that the zeroed
        # reward, not the raw one from the broken step, enters the statistics.
        if np.isnan(observation_new).any():
            reward = 0
            dictErrors["step"] = "NaN in state"
            errors += 1
            done = True

        P.episode_return += reward
        P.episode_length += 1
        epochReward += reward

        # Get the value and log-probability of the action
        value_t = P.critic(observation)
        logprobability_t = P.logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        P.buffer.store(observation, action, reward, value_t, logprobability_t)

        # Advance the agent's observation. It has to be written back to P.observation
        # because that attribute is what the next iteration reads at the top of the loop.
        P.observation = np.asarray(observation_new)

        # Finish trajectory if a terminal state was reached or the epoch is over
        if done or (t == steps_per_epoch - 1):
            # Bootstrap with the critic only when the episode was cut off by the epoch end
            last_value = 0 if done else P.critic(P.observation.reshape(1, -1))
            P.buffer.finish_trajectory(last_value)
            sum_return += P.episode_return
            sum_length += P.episode_length
            num_episodes += 1
            # Start a fresh episode: reset the environment and the per-episode
            # counters held on the agent (the same attributes accumulated above).
            P.observation, P.episode_return, P.episode_length = np.asarray(env.reset()), 0, 0

    # Get values from the buffer
    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer,
    ) = P.buffer.get()

    # Update the policy and implement early stopping using KL divergence
    for _ in range(train_policy_iterations):
        kl = P.train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break

    # Update the value function
    for _ in range(train_value_iterations):
        P.train_value_function(observation_buffer, return_buffer)

    timeEnd = time.time()  # End timer here

    # Report the epoch statistics. num_episodes is at least 1 because the last step
    # of every epoch always finishes a trajectory.
    print(
        f"Epoch: {epoch}"
        f"\n\tReward: {round(epochReward, 2)}"
        f"\n\tMean Return: {round(sum_return / num_episodes, 2)}"
        f"\n\tMean Length: {round(sum_length / num_episodes, 2)}"
        f"\n\tTime Elapsed: {round(timeEnd - timeStart)}s\n"
    )

    # Moving average of the total epoch reward over the last `movingRate` epochs
    epochRewards.append(epochReward)
    recentRewards = epochRewards[-movingRate:]
    averageReward = sum(recentRewards) / len(recentRewards)
    movingEpRewards["average"].append(averageReward)
    movingEpRewards["epoch"].append(epoch + 1)

    # Save graph and model for every n number of epochs
    if epoch % savePeriod == 0 and epoch != 0:
        if saveResultsToPlot:
            # Explicit figure/axes so the figure can be closed after saving and
            # figures do not pile up over a long training run.
            fig, ax = plt.subplots()
            ax.plot(movingEpRewards['epoch'], movingEpRewards['average'], label="average reward")
            ax.set_title("Epoch " + str(epoch) + " Rewards")
            ax.set_xlabel("Epochs")
            ax.set_ylabel("Reward")
            ax.legend(loc=0)
            fig.savefig("./Experiments/" + str(experimentName) + "/avgrewardplot" + str(epoch) + ".png")
            plt.show()
            plt.close(fig)

        P.archive(epoch)
        print("\nSaved Model!\n")

print("<<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>>>>")