# Prepare

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datetime import datetime
from scipy.stats import entropy
import scipy as sp

# Two Step Task
from two_step_task import *
env = two_step_task()

2023-01-03 09:34:13.580579: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Hyperparameters
gamma = 0.9  # Discount factor for past rewards
nb_steps = 100
nb_episodes = 10000
learning_rate = 7e-4
bootstrap_n = 10

beta_v = 0.05
beta_e = 0.05

In [3]:
# Save Paths
path = "train/" + datetime.now().strftime("%m%d-%H:%M:%S")
log_dir = path+'/logs/'
ckpt_dir = path+'/ckpt/'
train_summary_writer = tf.summary.create_file_writer(log_dir)

# LOSS

In [4]:
def compute_loss(
        action_probs: tf.Tensor,
        values: tf.Tensor,
        rewards: tf.Tensor,
        entropy : tf.Tensor) -> tf.Tensor:
    """Computes the combined actor-critic loss."""
    
    bootstrap_n = tf.shape(rewards)[0]
    
    R_t = []
    rewards = rewards[::-1]
    
    discounted_sum = values[-1]
    for i in tf.range(bootstrap_n):
        discounted_sum = rewards[i] + gamma * discounted_sum
        R_t.append(discounted_sum)
    
    #R_t = tf.convert_to_tensor(R_t)
    R_t = tf.convert_to_tensor(R_t[::-1])
    delta = R_t - values

    critic_loss = beta_v * 0.5 * tf.reduce_sum(tf.square(delta * values))
    #critic_loss = beta_v * tf.reduce_sum(delta * values)
    
    #log_actor = tf.math.log(tf.reduce_sum(action_probs) + 1e-7)
    #actor_loss = tf.reduce_sum(log_actor * delta)
    actor_loss = -tf.reduce_sum(tf.math.log(action_probs + 1e-7) * delta)
    
    entropy = beta_e * entropy

    total_loss = actor_loss + critic_loss + entropy

    return total_loss, actor_loss, critic_loss, entropy

## LSTM

In [5]:
num_inputs = 7 #states + reward + action + timestep= 3 + 1 + 2 + 1
num_actions = 2
num_hidden = 48

inputs = layers.Input(shape=(num_inputs))
state_h = layers.Input(shape=(num_hidden))
state_c = layers.Input(shape=(num_hidden))

common, states = layers.LSTMCell(num_hidden)(inputs, states=[state_h, state_c], training=True)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=[inputs,state_h,state_c], outputs=[action, critic, states], )

#model.save('init.h5')

# Train

In [6]:
optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
running_reward = 0
episode_count = 0
#model = keras.models.load_model('init.h5')
  

for episode in range(nb_episodes):  # Run until solved
    state = env.reset()
    action_probs_history = []
    critic_value_history = []
    rewards_history = []
    episode_reward = 0
    reward = 0.0
    action_onehot = np.zeros((2))
    cell_state = [tf.zeros((1,num_hidden)),tf.zeros((1,num_hidden))]
    
    episode_entropy = tf.zeros(())
    
    with tf.GradientTape() as tape:
        for timestep in range(nb_steps):
            
            input = np.append(state.numpy(),action_onehot)
            input = np.append(input,reward)
            input = np.append(input,timestep)
            input = tf.reshape(input, (1, num_inputs))

            # Predict action probabilities and estimated future rewards from environment state
            action_probs, critic_value, cell_state = model([input,cell_state[0],cell_state[1]])
            
            critic_value_history.append(tf.squeeze(critic_value))

            # Sample action from action probability distribution
            action_probs = tf.squeeze(action_probs)
            action = np.random.choice(num_actions, p=action_probs.numpy())
            action_probs_history.append(action_probs[action])
            action_onehot[action] = 1

            # Apply the sampled action in our environment
            state, reward, done, _ = env.trial(action)
            #state, reward, done, _ = env.step(np.random.randint(0,2))
            rewards_history.append(reward)
            episode_reward += reward
            
            #entropy
            #entropy = -tf.math.reduce_sum(tf.math.multiply(tmp,tf.math.log(tmp + 1e-7)))
            entropy = sp.stats.entropy(action_probs)
            episode_entropy += entropy
            
            if done: break


        # Calculating loss values to update our network
        total_loss, actor_loss, critic_loss, entropy = compute_loss(
            tf.convert_to_tensor(action_probs_history,dtype=tf.float32), 
            tf.convert_to_tensor(critic_value_history, dtype=tf.float32), 
            tf.convert_to_tensor(rewards_history, dtype=tf.float32), 
            episode_entropy)
        
        # Backpropagation
        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        with train_summary_writer.as_default():
            tf.summary.scalar('loss/total_loss', total_loss, step=episode_count)
            tf.summary.scalar('loss/actor_loss', actor_loss, step=episode_count)
            tf.summary.scalar('loss/critic_loss', critic_loss, step=episode_count)
            tf.summary.scalar('loss/entropy', episode_entropy, step=episode_count)
            tf.summary.scalar('game/reward', episode_reward, step=episode_count)
            tf.summary.histogram('game/action_probs', action_probs_history, step=episode_count)

    # Log details
    episode_count += 1
    if episode_count % 1000 == 0:
        template = "reward: {:.2f} at episode {}"
        print(template.format(episode_reward, episode_count))
        checkpoint = tf.train.Checkpoint(model)
        save_path = checkpoint.save(ckpt_dir+'checkpoints_'+str(episode_count)+'/two_steps.ckpt')

reward: 77.00 at episode 1000
reward: 72.00 at episode 2000
reward: 73.00 at episode 3000
reward: 82.00 at episode 4000
reward: 69.00 at episode 5000
reward: 76.00 at episode 6000
reward: 71.00 at episode 7000
reward: 67.00 at episode 8000
reward: 73.00 at episode 9000
reward: 85.00 at episode 10000


In [7]:
model.save(path+'/model.h5')



In [8]:
%load_ext tensorboard