# Curiosity-Based Prioritized Experience Replay

## Imports and Initialization

In [None]:
# External Libraries

import os
import random
import numpy as np
import tensorflow as tf
import time
import pandas as pd
from ast import literal_eval
import matplotlib.pyplot as plt
import json

# Visualization of gameplay inside notebook
from IPython.display import clear_output


In [None]:
# Load Project modules

from envs import make_env
from agents import build_agent
from prioritizers.priority_combinator import PriorityCombinator
from prioritizers.rnd_prioritizer import RNDPrioritizer
from prioritizers.aesh_prioritizer import AESHPrioritizer
from prioritizers.icm_prioritizer import ICMPrioritizer
from prioritizers.td_prioritizer import TDPrioritizer
from training import train
from utils.eval_scores import *
from utils.plot import *

In [None]:
# Report whether TensorFlow can see at least one GPU device

print("Checking available GPUs...")
gpu_available = bool(tf.config.list_physical_devices('GPU'))
print("GPU available" if gpu_available else "GPU not available")

# Also list the detected GPU devices themselves
print(tf.config.list_physical_devices('GPU'))

In [None]:
# Per-environment settings, keyed by the Gym environment id.
#   "max_timesteps": episode-length cap, used as `max_episode_steps` only when
#                    `use_max_timesteps` is enabled in the environment-selection cell.
#   "conv":          flag read into `env_conv` and forwarded to make_env, build_agent
#                    and the prioritizers (True for the image-based Atari games here).

envs_params = {
    #### Basic Gym environments
    "CartPole-v1" : {
        "max_timesteps":200,
        "conv": False
    },
    "MountainCar-v0" : {
        "max_timesteps":200,
        "conv": False
    },
    "Acrobot-v1" : {
        "max_timesteps":200,
        "conv": False
    },
    #### Requires the Box2D package:
    #### pip install Box2D
    "LunarLander-v2" : {
        "max_timesteps":1000,
        "conv": False
    },
    ##### Atari games
    "ALE/WizardOfWor-v5" : {
        "max_timesteps":5000,
        "conv": True
    },
    "ALE/Freeway-v5" : {
        "max_timesteps":5000,
        "conv": True
    },
    "ALE/Gravitar-v5" : {
        "max_timesteps":5000,
        "conv": True
    },
    "ALE/MontezumaRevenge-v5" : {
        "max_timesteps":2000,
        "conv":True
    },
    "ALE/Venture-v5" : {
        "max_timesteps":2000,
        "conv":True
    },
    "ALE/Breakout-v5" : {
        "max_timesteps":2000,
        "conv":True
    },
    "ALE/Pong-v5" : {
        "max_timesteps":2000,
        "conv":True
    }
}

In [None]:
# Select the environment under test and build its training / evaluation instances

env_name = "MountainCar-v0"
use_max_timesteps = False
clip_reward_training = False

env_settings = envs_params[env_name]
env_conv = env_settings["conv"]
# Episode-length cap taken from the table above, or None (forwarded as-is to make_env)
max_episode_steps = env_settings["max_timesteps"] if use_max_timesteps else None #1e7

# The training env may clip rewards; the evaluation env never does
env = make_env(env_name, env_conv, clip_reward=clip_reward_training, max_episode_steps=max_episode_steps)
eval_env = make_env(env_name, env_conv, clip_reward=False, max_episode_steps=max_episode_steps)

# Summary of the action / observation spaces
print("env.action_space: {}".format(env.action_space))
print("env.observation_space: {}".format(env.observation_space))
print("env.observation_space.shape: {}".format(env.observation_space.shape))
print("env.action_space.n: {}".format(env.action_space.n))

# Shape of a game state / observation
input_shape = env.observation_space.shape
# Number of possible actions available each turn for the agent
n_actions = env.action_space.n

In [None]:
# For conv (Atari) environments, preview the observation reached after 100 steps of action 1
if env_conv:
    debug_env = make_env(env_name, env_conv, max_episode_steps=max_episode_steps, render_mode="rgb_array")
    debug_obs = debug_env.reset(seed=1)[0]
    warmup_steps = 100
    for _step in range(warmup_steps):
        # Only the observation is kept; the other four step outputs are discarded
        debug_obs, _, _, _, _ = debug_env.step(1)
    # Display the first element along the leading axis of the observation
    plt.imshow(debug_obs[0])
    plt.show()

## Training

In [None]:
# Training session and hyper-parameter definition

# List of models to test (the list can be split so that the models are trained in separate sessions).
# Accepted names, as dispatched in train_models: "AESH", "ICM", "DIST", "TD", "UNIFORM".
models_to_test = ["AESH", "ICM", "DIST", "TD", "UNIFORM"]


# Whether the actual configuration has to be loaded from an external config file.
# If False, the parameters defined below are used instead.
load_config_from_json = True
config_file_name = "Generic-DQN-400K.json"

# ATTENTION: These params are used only if load_config_from_json == False
# Per-agent parameters; only the entry matching "selected_agent" is passed to build_agent.
agent_params = {
    "DQN": {
        "lr": 1e-3,
        "grad_clipping":None,
        "gamma":0.99
    },
    "SAC": {
        "lr_critic":1e-3,
        "lr_actor":1e-3,
        "lr_alpha":1e-3,
        "gamma":0.99
    }
}
# Replay-buffer / exploration settings, forwarded to train() as `buffer_params`
buffer_params = {
    "buffer_size": 300_000,
    "prioritized_replay_alpha": 0.8,
    "prioritized_replay_beta0": 0.2,
    "exploration_fraction": 0.2,
    "exploration_final_eps": 0.001
}
# Training-loop settings, forwarded to train() as `training_params`.
# "k_rollouts" is also the number of independent runs (one seed pair each) per model.
training_params = {
    "agent_train_freq": 4,
    "total_timesteps": 300_000,
    "learning_starts": 1000,
    "batch_size": 64,
    "eval_freq": 3000, # in timesteps
    "k_rollouts": 10,
    "buffer_prs_plot_freq": 30000
}
# Keyword arguments for PriorityCombinator(**priority_combinator_params)
priority_combinator_params = {
    "w_ic": 1,
    "w_td": 0,
    "w_rw": 0,
}
# Parameters of the RND prioritizer (model name "DIST")
rnd_prioritizer_params = {
    "learning_rate": 1e-3
}
# Parameters of the AESH prioritizer
aesh_prioritizer_params = {
    "k": 64 if env_conv else 32, # Length of the binary code
    "encoder_output_size": 256 if env_conv else 32,
    "lambda_regularizer": 10,
    "beta": 1, # p = beta/sqrt(N(phi(s)))
    "learning_rate": 1e-3
}
# Parameters of the ICM prioritizer
icm_prioritizer_params = {
    "embedding_size": 288 if env_conv else 32, #288,
    "beta": 0.2,
    "learning_rate": 1e-3
}

# Full configuration consumed by train_models (replaced by the JSON content when
# load_config_from_json is True)
hyper_params_dict = {
    "selected_agent": "DQN", # Choose between DQN or SAC
    "use_priority_combinator": True, # True to use the priority combinator to normalize and combine the priorities
    "agent_params": agent_params,
    "buffer_params": buffer_params,
    "training_params": training_params,
    "priority_combinator_params": priority_combinator_params,
    # Keyed by model name; "TD" and "UNIFORM" take no prioritizer parameters
    "prioritizers_params": {
        "DIST": rnd_prioritizer_params,
        "AESH": aesh_prioritizer_params,
        "ICM": icm_prioritizer_params
    },
    "episodes_print_freq": 50,
    "smoothing_window_size": 1
}


In [None]:
# Write configuration file (disabled: uncomment to export the in-notebook parameters)
# conifg_file_path = os.path.join("configs", "MountainCarConfig.json")
# with open(conifg_file_path, 'w', encoding='utf-8') as f:
#     json.dump(hyper_params_dict, f, ensure_ascii=False, indent=4)

# Load configuration file
if load_config_from_json:
    # Read a configuration file; it replaces the in-notebook hyper_params_dict entirely.
    conifg_file_path = os.path.join("configs", config_file_name)
    # The export snippet above writes UTF-8 with ensure_ascii=False, so decode as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(conifg_file_path, 'r', encoding='utf-8') as file:
        hyper_params_dict = json.load(file)

In [None]:
# Create the output directories (plots, dataframes, model checkpoints) and the configs directory.
# os.makedirs(..., exist_ok=True) also creates missing parent directories and does nothing
# when the directory already exists, so no separate existence check is needed.

# Sub-directory name shared by every output of this environment / agent pair
env_dir_name = format_plots_path_name(env_name=env_name, suffix=hyper_params_dict["selected_agent"])

dir_env_plots = os.path.join("plots", env_dir_name)
dir_env_dfs = os.path.join("dfs", env_dir_name)
dir_env_checkpoints = os.path.join("models_checkpoints", env_dir_name)

for output_dir in (dir_env_plots, dir_env_dfs, dir_env_checkpoints, "configs"):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
def set_global_seed(seed):
    """Seed the Python `random`, NumPy and TensorFlow global random generators with `seed`."""
    random.seed(seed)          # Python standard library
    np.random.seed(seed)       # NumPy legacy global generator
    tf.random.set_seed(seed)   # TensorFlow global seed

In [None]:
def plot_rewards_together(models_to_plot, selected_agent):
    """Plot the saved evaluation-return curves of several models on a single figure.

    The returns are read back from the CSV files stored in `dir_env_dfs`, which makes it
    possible to plot models that were trained in different sessions. `dir_env_dfs`,
    `dir_env_plots` and `env_name` are taken from the notebook's global scope.

    Args:
        models_to_plot: names of the models whose "eval_returns_<agent>_<model>.csv" is loaded.
        selected_agent: agent name used in the CSV file names and in the plot suffix.
    """
    # The "y" column stores a list per row, serialized as text; parse it back with literal_eval
    eval_returns_list = [
        pd.read_csv(
            os.path.join(dir_env_dfs, "eval_returns_"+selected_agent+"_"+model+".csv"),
            converters={"y": literal_eval},
        )
        for model in models_to_plot
    ]

    # Single output file holding all the reward curves
    plot_file_name = format_plots_path_name(dir=dir_env_plots,
                                            env_name=env_name,
                                            suffix="reward_all"+"_"+selected_agent)
    print_results_from_dataframe_ci(eval_returns_list, models_to_plot, file_name=plot_file_name)

In [None]:
def combine_rollouts_dfs(dfs_list, x_name="x", y_name="y"):
    """Merge single-run dataframes into one frame whose `y_name` column holds a list per row.

    Args:
        dfs_list: non-empty sequence of dataframes, each with `x_name` and `y_name` columns.
        x_name: name of the shared x column; its values are taken from the first dataframe.
        y_name: name of the value column; row i of the result holds the list of the
            row-i values of every dataframe, in `dfs_list` order.

    Returns:
        A dataframe with exactly the columns `x_name` and `y_name`.
    """
    merged = pd.DataFrame(data={x_name: dfs_list[0][x_name]})
    rollout_cols = list(range(len(dfs_list)))
    # Stage every run's values in a temporary column named after the run's position
    for col, rollout_df in zip(rollout_cols, dfs_list):
        merged[col] = rollout_df[y_name]
    # Collapse the temporary columns row-wise into a single list-valued column
    merged[y_name] = merged[rollout_cols].values.tolist()
    return merged.drop(columns=rollout_cols)

# Model identifiers accepted in `models_to_test`
_SUPPORTED_MODELS = ("AESH", "ICM", "DIST", "TD", "UNIFORM")


def _build_prioritizer(model, hyper_params_dict, input_shape, n_actions, env_conv, priority_combinator):
    """Instantiate the prioritizer matching `model`, or return None for "UNIFORM"."""
    if model == "UNIFORM":
        return None
    if model == "TD":
        return TDPrioritizer()
    if model == "DIST":
        prioritizer_cls = RNDPrioritizer
    elif model == "ICM":
        prioritizer_cls = ICMPrioritizer
    elif model == "AESH":
        prioritizer_cls = AESHPrioritizer
    else:
        raise ValueError("Incorrect model specification: " + repr(model))
    # The parameters are looked up only for models that need them, so a configuration
    # without a "prioritizers_params" entry still works for TD / UNIFORM runs.
    return prioritizer_cls(input_shape, n_actions, conv=env_conv,
                           params_dict=hyper_params_dict["prioritizers_params"][model],
                           priority_combinator=priority_combinator)


def train_models(models_to_test,
                 hyper_params_dict,
                 env,
                 eval_env,
                 train_seeds,
                 eval_seeds,
                 env_conv,
                 env_name,
                 dir_env_plots,
                 dir_env_dfs,
                 dir_env_checkpoints):
    """Train every model in `models_to_test` for `k_rollouts` seeded runs and save plots, CSVs and checkpoints.

    Args:
        models_to_test: list of model names, each one of "AESH", "ICM", "DIST", "TD", "UNIFORM".
        hyper_params_dict: configuration dictionary (selected agent, agent / buffer / training /
            prioritizer parameters, printing frequency).
        env: training environment.
        eval_env: evaluation environment.
        train_seeds: seeds for the training runs; at least `k_rollouts` entries are required.
        eval_seeds: seeds for the evaluation environment; at least `k_rollouts` entries are required.
        env_conv: flag forwarded to build_agent and to the prioritizers as `conv`.
        env_name: environment name used in the output file names.
        dir_env_plots: directory where the plots are written.
        dir_env_dfs: directory where the CSV dataframes are written.
        dir_env_checkpoints: directory where the trained agents are saved.

    Raises:
        ValueError: if a model name is not supported, if `k_rollouts` is smaller than 1, or if
            fewer seeds than `k_rollouts` are provided.
    """
    selected_agent = hyper_params_dict["selected_agent"]
    k_rollouts = hyper_params_dict["training_params"]["k_rollouts"]
    use_priority_combinator = hyper_params_dict["use_priority_combinator"]
    episodes_print_freq = hyper_params_dict["episodes_print_freq"]

    # Fail fast: a bad model name or a missing seed would otherwise only surface after
    # earlier (potentially very long) training runs have already completed.
    unknown_models = [name for name in models_to_test if name not in _SUPPORTED_MODELS]
    if unknown_models:
        raise ValueError("Incorrect model specification: " + str(unknown_models))
    if k_rollouts < 1:
        raise ValueError("k_rollouts must be at least 1, got " + str(k_rollouts))
    if len(train_seeds) < k_rollouts or len(eval_seeds) < k_rollouts:
        raise ValueError("k_rollouts is " + str(k_rollouts) + " but only "
                         + str(len(train_seeds)) + " training seeds and "
                         + str(len(eval_seeds)) + " evaluation seeds were provided")

    input_shape = env.observation_space.shape # Shape of a game state / observation
    n_actions = env.action_space.n # Number of possible actions available each turn for the agent

    for model in models_to_test:
        # Pretty Printing
        print("###################################################")
        print("TRAINING "+model+" MODEL:")

        # Does the model use a prioritized replay?
        prioritized_replay = model != "UNIFORM"

        eval_dfs_list = []
        pr_hist_list = []
        buffer_prs_list = []
        start_time = time.time()
        for rollout_n in range(k_rollouts):
            print("ROLLOUT N. "+str(rollout_n+1)+":")

            # Set global seed for the model
            set_global_seed(train_seeds[rollout_n])

            # Build agent
            agent = build_agent(selected_agent, env, env_conv, hyper_params_dict["agent_params"][selected_agent])

            # Build priority combinator
            if use_priority_combinator and prioritized_replay:
                priority_combinator = PriorityCombinator(**hyper_params_dict["priority_combinator_params"])
            else:
                priority_combinator = None

            # Build prioritizer
            prioritizer = _build_prioritizer(model, hyper_params_dict, input_shape, n_actions,
                                             env_conv, priority_combinator)

            # Perform training
            env.reset(seed=train_seeds[rollout_n])
            eval_env.reset(seed=eval_seeds[rollout_n])
            _, evaluations, ep_returns, pr_hist, buffer_prs = train(agent, env, eval_env, prioritizer,
                                                                     episodes_print_freq=episodes_print_freq,
                                                                     buffer_params=hyper_params_dict["buffer_params"],
                                                                     training_params=hyper_params_dict["training_params"],
                                                                     prioritized_replay=prioritized_replay)
            eval_dfs_list.append(evaluations)
            pr_hist_list.append(pr_hist)
            buffer_prs_list.append(buffer_prs)
            print()
            print("***************************************************")
        print(f"Training time: {(time.time()-start_time)/60} minutes")

        # Combine rollouts dfs in a single dataframe
        eval_returns_comb = combine_rollouts_dfs(eval_dfs_list, x_name="x", y_name="y")

        # Save training data
        # Episodes rewards (only those of the last rollout: ep_returns is overwritten at every rollout)
        print_ep_rewards_from_dataframe(ep_returns, model,
                                        file_name=format_plots_path_name(dir=dir_env_plots,
                                                                         env_name=env_name,
                                                                         suffix="ep_"+selected_agent+"_"+model))
        # Step reward
        print_results_from_dataframe_ci([eval_returns_comb],
                                        [model],
                                        file_name=format_plots_path_name(dir=dir_env_plots,
                                                                         env_name=env_name,
                                                                         suffix="reward_"+selected_agent+"_"+model),
                                        ylabel="Reward")
        # Save also as a csv
        eval_returns_comb.to_csv(os.path.join(dir_env_dfs, "eval_returns_"+selected_agent+"_"+model+".csv"), index=False)

        # Priority statistics exist only for prioritized models (no prioritizer for "UNIFORM")
        if prioritized_replay:
            # Combine rollouts dfs in a single dataframe
            pr_hist_comb = combine_rollouts_dfs(pr_hist_list, x_name="x", y_name="y")
            buffer_prs_comb = combine_rollouts_dfs(buffer_prs_list, x_name="tms", y_name="prs")

            # Priority history
            print_results_from_dataframe_ci([pr_hist_comb],
                                            [model],
                                            file_name=format_plots_path_name(dir=dir_env_plots,
                                                                             env_name=env_name,
                                                                             suffix="pr_hist_"+selected_agent+"_"+model),
                                            ylabel="Max Priority")

            # Priorities stored in the buffer during training
            print_buffer_priorities_from_dataframe(buffer_prs_comb,
                                                   file_name=format_plots_path_name(dir=dir_env_plots,
                                                                                    env_name=env_name,
                                                                                    suffix="buffer_prs_"+selected_agent+"_"+model))

            # Save priority history and priorities stored in the buffer as csv
            pr_hist_comb.to_csv(os.path.join(dir_env_dfs, "pr_hist_"+selected_agent+"_"+model+".csv"), index=False)
            buffer_prs_comb.to_csv(os.path.join(dir_env_dfs, "buffer_prs_"+selected_agent+"_"+model+".csv"), index=False)

        # Save trained agent (the one produced by the last rollout)
        agent.save_model(dir_env_checkpoints, (selected_agent+"_"+model))

        # Pretty Printing
        print()
        print()

    # Plot together all the reward curves.
    # NOTE: plot_rewards_together reads env_name, dir_env_dfs and dir_env_plots from the
    # notebook's global scope, not from this function's arguments.
    plot_rewards_together(models_to_test, selected_agent)

In [None]:
# Seed the global generators first, so that any seed generation below is reproducible
set_global_seed(42)

# Manual switch: set to True to draw fresh Gym seeds instead of using the fixed lists
regenerate_seeds = False

if regenerate_seeds:
    # Draw one training seed and one evaluation seed per rollout, alternating between the two
    k_rollouts = hyper_params_dict["training_params"]["k_rollouts"]
    max_seed_n = 1000
    train_seeds, eval_seeds = [], []
    for _ in range(k_rollouts):
        train_seeds.append(np.random.randint(max_seed_n))
        eval_seeds.append(np.random.randint(max_seed_n))
    print("GYM TRAINING SEEDS: "+str(train_seeds))
    print("GYM EVALUATION SEEDS: "+str(eval_seeds))
else:
    # Fixed seeds (10 per list, one per rollout)
    train_seeds = [102, 860, 106, 700, 614, 466, 330, 87, 99, 663]
    eval_seeds = [435, 270, 71, 20, 121, 214, 458, 372, 871, 130]


In [None]:
# Run the whole training session for the selected models
train_models(models_to_test=models_to_test,
             hyper_params_dict=hyper_params_dict,
             env=env,
             eval_env=eval_env,
             train_seeds=train_seeds,
             eval_seeds=eval_seeds,
             env_conv=env_conv,
             env_name=env_name,
             dir_env_plots=dir_env_plots,
             dir_env_dfs=dir_env_dfs,
             dir_env_checkpoints=dir_env_checkpoints)

In [None]:
# Overlay the saved reward curves of all 5 models on one plot
all_models = ["AESH", "ICM", "DIST", "TD", "UNIFORM"]
plot_rewards_together(all_models, hyper_params_dict["selected_agent"])

In [None]:
# Compute the average final scores of the five models from the saved dataframes, then print them
evaluated_models = ["AESH", "ICM", "DIST", "TD", "UNIFORM"]
eval_scores = compute_eval_scores_tms(dir_env_dfs, evaluated_models, hyper_params_dict["selected_agent"])
print_eval_scores(eval_scores)

In [None]:
# PRINT COLORED REWARDS AND PRIORITIES (possibly overwrites the previous mono-colored plots)

# Resolve the agent name here: at module level it is otherwise only assigned by the
# model-loading cell further down, so a top-to-bottom run would hit a NameError.
selected_agent = hyper_params_dict["selected_agent"]

# Each model gets its own color from the default matplotlib cycle ("C0", "C1", ...)
for color_index, model in enumerate(["AESH", "ICM", "DIST", "TD", "UNIFORM"]):
    color = "C" + str(color_index)
    # PLOT REWARD
    eval_returns = pd.read_csv(os.path.join(dir_env_dfs, "eval_returns_"+selected_agent+"_"+model+".csv"), converters={"y": literal_eval})
    print_results_from_dataframe_ci([eval_returns],
                                    [model],
                                    file_name=format_plots_path_name(dir=dir_env_plots,
                                                                     env_name=env_name,
                                                                     suffix="reward_"+selected_agent+"_"+model),
                                    color=color)

    # PLOT PRIORITIES
    # train_models writes the priority history only for models that use a prioritizer,
    # so the file does not exist for "UNIFORM": skip the plot instead of crashing.
    pr_hist_path = os.path.join(dir_env_dfs, "pr_hist_"+selected_agent+"_"+model+".csv")
    if not os.path.exists(pr_hist_path):
        print("No priority history found for "+model+", skipping its priority plot")
        continue
    pr_hist = pd.read_csv(pr_hist_path, converters={"y": literal_eval})
    print_results_from_dataframe_ci([pr_hist],
                                    [model],
                                    file_name=format_plots_path_name(dir=dir_env_plots,
                                                                     env_name=env_name,
                                                                     suffix="pr_hist_"+selected_agent+"_"+model),
                                    ylabel="Max Priority",
                                    color=color)




## Play a Game with the Curiosity-Agent

In [None]:
# Rebuild the selected agent and load the checkpoint that train_models saved for the ICM model
selected_agent = hyper_params_dict["selected_agent"]
selected_agent_params = hyper_params_dict["agent_params"][selected_agent]
agent = build_agent(selected_agent, env, env_conv, selected_agent_params)
checkpoint_name = selected_agent+"_"+"ICM"
agent.load_model(dir_env_checkpoints, checkpoint_name)

In [None]:
# Play one game with the loaded agent, rendering every frame inside the notebook
model = agent
play_env = make_env(env_name, env_conv, clip_reward=False, max_episode_steps=None, render_mode="rgb_array")
try:
    for episode in range(1): # Play one game
        obs = play_env.reset()[0]
        total_rew = 0
        t = 0 # Number of environment steps taken so far
        while True:
            # Replace the previously displayed frame with the current one
            clear_output(wait=True)
            plt.imshow(play_env.render())
            plt.show()

            # Add a leading axis of size 1 to the observation before passing it to the agent
            obs = np.expand_dims(np.array(obs), axis=0)
            obs_tf = tf.constant(obs)
            action = model.step(obs_tf, stochastic=False)

            obs, rew, done, truncated, _  = play_env.step(int(action))
            total_rew = total_rew + rew
            t = t + 1

            if done or truncated:
                break
        # t already counts every step taken (it is incremented right after each step),
        # so it is reported as-is; adding 1 would over-count by one.
        print("Episode finished after {} timesteps".format(t))
        print("Score: {}".format(total_rew))
finally:
    # Release the environment even if the loop is interrupted or raises
    play_env.close()