In [None]:
from diplomacy_gym_environment import DiplomacyEnvironment
import random
import time
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from IPython import display

# Template showing how to use my custom Gym environment by setting up an RL agent.


def random_move():
    """Pick one random order for every orderable location of every power.

    Reads the module-level ``env``.

    Returns:
        dict: maps each power name in ``env.game.powers`` to a list holding
        one randomly chosen order per orderable location of that power.
        Locations whose candidate-order list is empty are skipped.
    """
    # Query the order table once instead of once per location.
    possible_orders = env.game.get_all_possible_orders()

    actions = {}
    for power_name in env.game.powers.keys():
        actions[power_name] = [
            random.choice(possible_orders[loc])
            for loc in env.game.get_orderable_locations(power_name)
            # random.choice raises IndexError on an empty sequence, so a
            # location without any candidate order must be left out.
            if possible_orders[loc]
        ]
    return actions


def random_nn_move():
    """Build a random score vector for every power.

    Reads the module-level ``env``.

    Returns:
        dict: power name -> 1-D NumPy array containing one
        ``random.random()`` draw per entry of ``env.action_list``.
    """
    return {
        power: np.array([random.random() for _unused in env.action_list])
        for power in env.game.powers.keys()
    }


def random_vs_nonrandom_move(probs, agent_power='AUSTRIA'):
    """Combine the agent's scores with random scores for all other powers.

    Reads the module-level ``env``.

    Args:
        probs: score vector for the agent-controlled power; it is converted
            with ``np.array``.
        agent_power: name of the power that receives ``probs``. Defaults to
            ``'AUSTRIA'``, which was previously hard-coded.

    Returns:
        dict: power name -> 1-D NumPy array. The entry for ``agent_power``
        is ``np.array(probs)``; every other power gets one
        ``random.random()`` draw per entry of ``env.action_list``.
    """
    actions = {}
    for power_name in env.game.powers.keys():
        if power_name == agent_power:
            actions[power_name] = np.array(probs)
        else:
            actions[power_name] = np.array(
                [random.random() for _ in env.action_list]
            )
    return actions


def create_model():
    """Build the shared-trunk actor-critic network.

    Layer sizes come from the module-level ``num_inputs``, ``num_middle``
    and ``num_actions``.

    Returns:
        keras.Model: takes a ``(num_inputs,)`` input and produces two
        outputs, in this order: a sigmoid-activated actor head with
        ``num_actions`` units and a single-unit linear critic head.
    """
    inputs = layers.Input(shape=(num_inputs,))

    # Four identical fully connected ReLU layers shared by both heads.
    hidden = inputs
    for _ in range(4):
        hidden = layers.Dense(num_middle, activation="relu")(hidden)

    actor = layers.Dense(num_actions, activation="sigmoid")(hidden)
    critic = layers.Dense(1)(hidden)

    return keras.Model(inputs=inputs, outputs=[actor, critic])


def visualize_state(saved_display, rendering):
    """Show the given rendering, then a fresh rendering of the environment.

    Args:
        saved_display: display handle whose ``update`` method replaces the
            shown output.
        rendering: SVG data for the state with the actions committed.

    Each image stays on screen for ``wait_time`` seconds (module-level).
    """
    def _show(svg_data):
        # Swap the displayed image and pause so it can be seen.
        saved_display.update(display.SVG(svg_data))
        time.sleep(wait_time)

    # First the state with the committed actions ...
    _show(rendering)
    # ... then whatever the environment renders now.
    _show(env.render())

def choose_action(action_probs, epsilon):
    """Epsilon-greedy choice between random scores and the network's scores.

    Args:
        action_probs: batched actor output; only ``action_probs[0]`` is used.
        epsilon: exploration threshold compared against a draw from
            ``np.random.rand``.

    Returns:
        dict: the result of ``random_nn_move()`` when ``epsilon`` exceeds
        the draw, otherwise ``random_vs_nonrandom_move(action_probs[0])``.
    """
    explore = epsilon > np.random.rand(1)[0]
    if not explore:
        # Exploit: hand the network's scores to the agent's power.
        return random_vs_nonrandom_move(action_probs[0])
    # Explore: every power, the agent included, gets random scores.
    return random_nn_move()

In [None]:
# My own custom-made gym environment
env = DiplomacyEnvironment(prints=False, render_path=None)

# --- General settings ---
seed = 42
wait_time = 0.1        # passed to time.sleep() after each displayed rendering
print_level = 0        # >= 1 prints the layer sizes below
visualize_every = 10   # episodes for which episode_count % visualize_every == 0 are rendered
eps = np.finfo(np.float32).eps.item()  # keeps the return normalisation away from a zero division

# The seed was previously defined but never applied. Seed every RNG this
# script draws from so runs are repeatable.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# --- Learning hyper-parameters ---
# discount factor
gamma = 0.99
# Epsilon (exploration vs. exploitation): starts at epsilon_max and is
# lowered once per episode until it reaches epsilon_min.
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = (epsilon_max - epsilon_min)
epsilon = epsilon_max
# number of episodes needed to go from epsilon_max down to epsilon_min
epsilon_greedy_episodes = 1000
# not used by the actor-critic loop; only the Q-learning scrap refers to it
batch_size = 32
learning_rate = 0.01

# size of NN layers
num_inputs = env.observation_space.n
num_actions = env.action_space.shape[0]
num_middle = 1024


if print_level >= 1:
    print(f'input layer size: {num_inputs}')
    print(f'4 x middle layer size: {num_middle}')
    print(f'output layer size: {num_actions}')

# defining learning model
model = create_model()
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
loss_function = keras.losses.Huber()

# Per-episode histories; all of them are cleared at the end of each episode.
action_history = []
action_probs_history = []
critic_value_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []

# keep track of how healthy the network is
running_reward = 0
episode_count = 0
total_frame_count = 0

# Disabled replay-buffer settings. Not sure about these values; they have to
# do with batching, experience replay and not updating the network after
# every episode / step.
#
# # Maximum replay length
# # Note: The Deepmind paper suggests 1000000 however this causes memory issues
# max_memory_length = 100000
# # Train the model after this many actions
# update_after_actions = 100
# # How often to update the target network
# update_target_network = 100

saved_display = display.display("This text should be replaced by rendering...", display_id=True)

# Actor-critic training loop. Each pass of the outer ``while`` plays one full
# episode under a single GradientTape and then applies one optimizer step.
# The loop has no exit condition; interrupt the kernel to stop training.
#
# NOTE(review): the values appended to ``action_probs_history`` come from
# ``choose_action`` and are NumPy arrays, so they are constants as far as the
# tape is concerned. The actor term below therefore only back-propagates
# through ``value`` (the critic output); the actor head receives no gradient.
# The history entries are also raw scores rather than log-probabilities,
# despite the ``log_prob`` name. Fixing this needs a decision about how the
# policy should be sampled, so it is flagged here rather than changed.
while True:
    with tf.GradientTape() as tape:
        # this is the start of an episode
        done = False
        # Setting to use the highest action every time rather than a
        # probability distribution. Exploiter mode is currently the default
        # and probability mode has not been implemented, so this setting is
        # not read anywhere yet.
        exploiter = False
        state = np.array(env.reset())
        episode_reward = 0
        frame_count = 0

        with tqdm(desc=f"episode {episode_count} steps") as pbar:
            while not done:
                # this is the start of a frame
                pbar.update(1)

                # convert state to a tensor with a leading batch axis
                state_tensor = tf.convert_to_tensor(state)
                state_tensor = tf.expand_dims(state_tensor, 0)

                # Use network
                action_probs, critic_value = model(state_tensor, training=False)

                # Choose action based on policy
                action = choose_action(action_probs, epsilon)
                # TODO at a certain print level translate action back to common names and print them as well as their probability

                # Apply the sampled action in our environment
                state_next, reward, done, info, rendering = env.step(action, render=True)
                # TODO at a certain print level print expected value, reward, and expected value of next_state as well as the relations between these

                state_name, info = info
                # Only element 0 of reward / done / state_next is used.
                # presumably index 0 belongs to the agent's power - TODO confirm
                episode_reward += reward[0]
                reward = reward[0]
                done = done[0]
                state_next = np.array(state_next[0])
                # presumably info['AUSTRIA'] indexes the actions that were
                # actually taken for the agent - TODO confirm
                action_mask = np.zeros(num_actions)
                action_mask[info['AUSTRIA']] = 1

                if episode_count % visualize_every == 0:
                    visualize_state(saved_display, rendering)

                # Save stats in replay buffer
                action_history.append(action_mask)
                action_probs_history.append(action['AUSTRIA'])
                critic_value_history.append(critic_value[0, 0])
                state_history.append(state)
                state_next_history.append(state_next)
                done_history.append(done)
                rewards_history.append(reward)

                # Change state
                state = state_next

                # this is the end of a frame
                frame_count += 1
                total_frame_count += 1

            # start of end of episode here
            pbar.close()

            # Exponential moving average of the episode rewards
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

            # Decay probability of taking random action
            epsilon -= epsilon_interval / epsilon_greedy_episodes
            epsilon = max(epsilon, epsilon_min)

            # Calculate expected value from rewards
            # - At each timestep what was the total reward received after that timestep
            # - Rewards in the past are discounted by multiplying them with gamma
            # - These are the labels for our critic
            # Accumulate back to front with append and reverse once at the
            # end; inserting at index 0 on every step is quadratic.
            returns = []
            discounted_sum = 0
            for r in reversed(rewards_history):
                discounted_sum = r + gamma * discounted_sum
                returns.append(discounted_sum)
            returns.reverse()

            # Normalize
            returns = np.array(returns)
            returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
            returns = returns.tolist()

            # Calculating loss values to update our network
            history = zip(action_probs_history, critic_value_history, returns)
            actor_losses = []
            critic_losses = []
            for log_prob, value, ret in history:
                # At this point in history, the critic estimated that we would get a
                # total reward = `value` in the future. We took an action scored
                # `log_prob` and ended up receiving a total reward = `ret`.
                # The actor must be updated so that it predicts an action that leads to
                # high rewards (compared to critic's estimate) with high probability.
                diff = ret - value
                actor_losses.append(-log_prob * diff)  # actor loss

                # The critic must be updated so that it predicts a better estimate of
                # the future rewards.
                critic_losses.append(
                    loss_function(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                )

            # Backpropagation
            loss_value = sum(actor_losses) + sum(critic_losses)
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # End of tape

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()
        action_history.clear()
        state_history.clear()
        state_next_history.clear()
        done_history.clear()

        # Log details (each label now sits next to the value it describes)
        episode_count += 1
        print(f"episode {episode_count} finished (frame {total_frame_count}) - last state: {state_name}, running reward: {running_reward:.2f}, episode reward: {episode_reward:.2f}, actor loss: {np.mean(actor_losses)}, critic loss: {np.mean(critic_losses)}")

        # end of end of episode here

# Earlier scrap based on Q-learning
'''
    # Update every fourth frame and once batch size is over 32
    if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
        # Get indices of samples for replay buffers
        indices = np.random.choice(range(len(done_history)), size=batch_size)

        # Using list comprehension to sample from replay buffer
        state_sample = np.array([state_history[i] for i in indices])
        state_next_sample = np.array([state_next_history[i] for i in indices])
        rewards_sample = [rewards_history[i] for i in indices]
        action_sample = [action_history[i] for i in indices]
        done_sample = tf.convert_to_tensor(
            [float(done_history[i]) for i in indices]
        )

        # Build the updated Q-values for the sampled future states
        # Use the target model for stability
        future_rewards = model_target.predict(state_next_sample)
        # Q value = reward + discount factor * expected future reward
        updated_q_values = rewards_sample + gamma * tf.reduce_max(
            future_rewards, axis=1
        )

        # If final frame set the last value to -1
        updated_q_values = updated_q_values * (1 - done_sample) - done_sample

        # Create a mask so we only calculate loss on the updated Q-values
        # updated mask to just be probabilities
        masks = tf.convert_to_tensor(action_sample, dtype=tf.float32)

        with tf.GradientTape() as tape:
            # Train the model on the states and updated Q-values
            q_values = model(state_sample)

            # Apply the masks to the Q-values to get the Q-value for action taken
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            # Calculate loss between new Q-value and old Q-value
            loss = loss_function(updated_q_values, q_action)

        # Backpropagation
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    if frame_count % update_target_network == 0:
        # update the target network with new weights
        model_target.set_weights(model.get_weights())
        # Log details
        template = "running reward: {:.2f} at episode {}, frame count {}"
        print(template.format(running_reward, episode_count, frame_count))

    # Limit the state and reward history
    if len(rewards_history) > max_memory_length:
        del rewards_history[:1]
        del state_history[:1]
        del state_next_history[:1]
        del action_history[:1]
        del done_history[:1]

    if done:
        break

    # Update running reward to check condition for solving
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1
    print("episode: {}, total reward: {:.2f}".format(episode_count, episode_reward))

    if episode_count >= 1000:
        model.save('model/test_model')
        break
'''