## Performance measurements of a specific DQN agent


![Cartpole environment](https://nav74neet.github.io/media/blog/openaigym/openaigym.jpg)

#### Includes:

- TensorBoard integration
- Final Policy evaluation
- Best Policy evaluation
- Videos of both policies

### The transformer network

In [1]:
# Select the GPU used by this notebook: only device 0 is exposed.
# This is set before TensorFlow is imported further down.
import os

os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [2]:
import sys

# Make the helper modules in 'modules/' importable.
# Position 1 is used because slot 0 is the script path (or '' in a REPL).
sys.path[1:1] = ['modules/']

In [3]:
from train_eval import *
from transformers_encoders import * 
from utils import *

In [4]:
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.networks import network
from tf_agents.environments import tf_environment
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gym
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.policies import policy_saver
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.trajectories.trajectory import *
from tf_agents.utils import common 
from tf_agents.environments.tf_wrappers import TFEnvironmentBaseWrapper
from tf_agents.utils.common import element_wise_squared_loss, element_wise_huber_loss

# Lookup from the integer encoder id (1, 2, 3) to the matching encoder class.
encoderList = dict(enumerate((Encoder_1, Encoder_2, Encoder_3), start=1))

In [22]:
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook.

    Args:
        filename: Path of the mp4 file to read.

    Returns:
        An ``IPython.display.HTML`` object wrapping a ``<video>`` tag whose
        source is the file content inlined as a base64 data URI.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
    """
    # Imported locally: the notebook's import cell does not import these
    # explicitly, so the function must not rely on them leaking in from
    # a star import.
    import base64
    from IPython.display import HTML

    # The context manager closes the file handle as soon as the bytes are read.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())
    return HTML(tag)

### Load the training args

In [32]:
# Choose which agent you want to evaluate and specify the correct path
directory = "videos/Acro_validate_4"

# NOTE: unpickling can execute arbitrary code, so only point `directory` at
# runs you produced yourself.
# The context manager closes the file handle once the arguments are loaded.
with open(directory + "/training_args.p", "rb") as args_file:
    args = pickle.load(args_file)

In [33]:
# Show the training arguments that were just loaded from the pickle file.
args

Namespace(approx_env_boundaries=False, atari=False, batch_size=32, checkpoint_interval=10000, collect_steps_per_iteration=1, custom_last_layer=True, custom_layer_init=1, custom_lr_schedule='No', d_model=64, debug_summaries=False, dff=256, doubleQ=True, encoder_type=3, env='Acrobot-v1', epsilon_greedy=0.1, eval_interval=1000, gamma=0.99, gradient_clipping=True, initial_collect_steps=500, layer_type=3, learning_rate=0.0001, log_interval=1000, loss_function='element_wise_huber_loss', max_horizon=5, normalize_env=False, num_eval_episodes=10, num_heads=4, num_iparallel=1, num_iterations=150000, num_layers=3, output_dir='Acro_validate_4', rate=0.1, replay_buffer_max_length=100000, reward_scale_factor=1.0, run_graph_mode=True, summarize_grads_and_vars=False, summary_flush=10, summary_interval=1000, target_update_period=10, target_update_tau=1, train_steps_per_iteration=1)

### TensorBoard integration
(Works more consistently when TensorBoard is started manually and only loaded here)

In [24]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# Load the TensorBoard notebook extension
    %tensorboard --logdir {directory}

### Build Everything according to script

In [34]:
# Step counter that is later handed to the agent and to both checkpointers.
global_step = tf.compat.v1.train.get_or_create_global_step()


def _wrap_for_agent(py_env):
    """Apply the optional normalisation wrapper, then the history wrapper."""
    if args.normalize_env == True:
        py_env = NormalizeWrapper(py_env, args.approx_env_boundaries, args.env)
    return PyhistoryWrapper(py_env, args.max_horizon, args.atari)


# Plain gym environment; the notebook only reads its action-space size later.
baseEnv = gym.make(args.env)

# Two independent copies of the task: one to step through, one for evaluation.
env = suite_gym.load(args.env)
eval_env = suite_gym.load(args.env)
env = _wrap_for_agent(env)
eval_env = _wrap_for_agent(eval_env)

# TensorFlow-facing views of the wrapped Python environments.
tf_env = tf_py_environment.TFPyEnvironment(env)
eval_tf_env = tf_py_environment.TFPyEnvironment(eval_env)


In [35]:
# Inspect the current time step of the wrapped TF environment.
tf_env.current_time_step()

TimeStep(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0])>, reward=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>, discount=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, observation=<tf.Tensor: shape=(1, 30), dtype=float32, numpy=
array([[ 0.9999146 , -0.013068  ,  0.9998928 , -0.01464336,  0.01964835,
        -0.00381415,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]],
      dtype=float32)>)

In [36]:
# Network hyper-parameters, all taken from the restored training arguments.
transformer_config = dict(
    num_layers=args.num_layers,
    d_model=args.d_model,
    num_heads=args.num_heads,
    dff=args.dff,
    rate=args.rate,
    encoderType=args.encoder_type,
    enc_layer_type=args.layer_type,
    max_horizon=args.max_horizon,
    custom_layer=args.custom_layer_init,
    custom_last_layer=args.custom_last_layer,
)

# Q-network: observation spec of the wrapped environment in, one output per
# discrete action of the plain gym environment.
q_net = QTransformer(
    tf_env.observation_spec(),
    baseEnv.action_space.n,
    **transformer_config)

In [37]:
    # Rebuild the optimizer and the TD-error loss from the stored training
    # arguments so the agent is constructed the same way as during training.
    if args.custom_lr_schedule == "Transformer":
        # lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(args.d_model, int(args.num_iterations/10))
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    elif args.custom_lr_schedule == "Transformer_low":
        # Author's note: same schedule with a lower general lr.
        # NOTE(review): confirm against CustomSchedule that halving d_model
        # really lowers the learning rate.
        learning_rate = CustomSchedule(int(args.d_model/2), int(args.num_iterations/10))
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    elif args.custom_lr_schedule == "Linear":
        # The base learning rate comes from the training args; a bare
        # `learning_rate` name is not defined on this branch.
        lrs = LinearCustomSchedule(args.learning_rate, args.num_iterations)
        optimizer = tf.keras.optimizers.Adam(lrs, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    else:
        # No custom schedule: plain Adam with the constant learning rate.
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=args.learning_rate)

    if args.loss_function == "element_wise_huber_loss":
        lf = element_wise_huber_loss
    elif args.loss_function == "element_wise_squared_loss":
        lf = element_wise_squared_loss
    else:
        # Fail here instead of leaving `lf` undefined (or stale from an
        # earlier run of this cell) for the agent construction.
        raise ValueError("Unknown loss_function: {!r}".format(args.loss_function))

In [38]:
    # Plain DQN when doubleQ is False, Double DQN otherwise. Both variants
    # receive exactly the same configuration.
    agent_class = dqn_agent.DqnAgent if args.doubleQ == False else dqn_agent.DdqnAgent

    agent = agent_class(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=lf,
        epsilon_greedy=args.epsilon_greedy,
        gamma=args.gamma,
        reward_scale_factor=args.reward_scale_factor,
        gradient_clipping=args.gradient_clipping,
        target_update_tau=args.target_update_tau,
        target_update_period=args.target_update_period,
        debug_summaries=args.debug_summaries,
        summarize_grads_and_vars=args.summarize_grads_and_vars,
        train_step_counter=global_step)  # global step count
    agent.initialize()

Initializing stuff.Weird Tensor Catch to prevent the model to break Case 2 (Empty Tensor)
Initializing stuff.Weird Tensor Catch to prevent the model to break Case 2 (Empty Tensor)


## Get either the best or the final policy checkpoint, make a video and get the average return

In [39]:
# Checkpointer over the training directory of the selected run; it tracks the
# agent and the global step and keeps a single checkpoint.
train_checkpointer = common.Checkpointer(
    ckpt_dir="/".join([directory, "train"]),
    max_to_keep=1,
    agent=agent,
    global_step=global_step,
)
# Restore from an existing checkpoint, or initialise if there is none.
train_checkpointer.initialize_or_restore()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1e4e16ce388>

In [40]:
# Record one episode of the restored policy at 25 fps
# (presumably written next to the run as "finalAgent" -- confirm in create_policy_eval_video).
create_policy_eval_video(
    agent.policy,
    eval_env,
    "/".join([directory, "finalAgent"]),
    num_episodes=1,
    fps=25,
)
# Return of the same policy over one episode in the TF evaluation environment.
compute_avg_return(eval_tf_env, agent.policy, num_episodes=1)



-80.0

In [15]:
# Checkpointer over the "policy" sub-directory of the training directory; it
# tracks agent.policy and the global step and keeps a single checkpoint.
policy_checkpointer = common.Checkpointer(
    ckpt_dir="/".join([directory, "train", "policy"]),
    max_to_keep=1,
    policy=agent.policy,
    global_step=global_step,
)
# Restore from an existing checkpoint, or initialise if there is none.
policy_checkpointer.initialize_or_restore()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x29b8e113708>

In [16]:
# Record one episode of the policy restored from the policy checkpoint
# (presumably written next to the run as "bestAgent" -- confirm in create_policy_eval_video).
create_policy_eval_video(
    agent.policy,
    eval_env,
    "/".join([directory, "bestAgent"]),
    num_episodes=1,
)
# Return of the same policy over one episode in the TF evaluation environment.
compute_avg_return(eval_tf_env, agent.policy, num_episodes=1)



500.0

# Attention display
##### Seems to be broken by the TF version? Needs further investigation

In [17]:
# Step the TF environment once with action 1 and keep the returned time step.
a = tf_env.step(1)

In [None]:
# Run the Q-network on the observation of the time step held in `a`.
# The first result gets its own name so that `a` stays a time step and this
# cell can be re-run; `b` is the second result requested via attention_out=True.
q_values, b = agent._q_network(a.observation, attention_out=True)

In [None]:
# Plot the first entry of the attention output.
# NOTE(review): the literal 4 presumably is the number of attention heads
# (args.num_heads is 4 for this run) -- confirm against plot_self_attention.
plot_self_attention(b[0], 4)

# Q-value samples

In [20]:
def collect_q(environment, agent, n):
    """Step the environment with the agent's policy and record the network output.

    Starting from the environment's current time step, the policy action is
    applied n times. After every step the Q-network is evaluated on the new
    observation and the first element of its result is stored as a nested list.

    Args:
        environment: Object offering current_time_step() and step(action).
        agent: Object offering policy.action(time_step) and _q_network(observation).
        n: Number of steps to take.

    Returns:
        A list with one entry per step, each being the first element of the
        network result converted with .numpy().tolist().
    """
    samples = []
    current = environment.current_time_step()
    for _ in range(n):
        # First entry of the action taken from the policy's action step.
        chosen = agent.policy.action(current).action.numpy()[0]
        current = environment.step(chosen)
        # Evaluate the network on the observation reached after the step.
        network_out = agent._q_network(current.observation)
        samples.append(network_out[0].numpy().tolist())
    return samples

In [21]:
# Sample the network output for two consecutive policy steps in tf_env.
collect_q (tf_env,agent,2)


[[[117294710784.0, 147785646080.0]], [[120905121792.0, 152334565376.0]]]

### Display the videos if needed

In [16]:
# Inline player for the "bestAgent.mp4" file of the selected run.
embed_mp4(directory + "/bestAgent.mp4")

In [17]:
# Inline player for the "finalAgent.mp4" file of the selected run.
embed_mp4(directory + "/finalAgent.mp4")