In [1]:
## Imports

import numpy as np
from custom_envs.gridworlds import SimpleGridworldEnv

from function_approximators.function_approximators import NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics.pairwise import rbf_kernel

from utils.train_utils import train, solve, train_time
from agents.agents import DQNAgent, LinearAgent, FQIAgent, OnlineGaussianProccessAgent

import operator


In [2]:
## Environment

function_approximators = [NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess]

agents = [DQNAgent, LinearAgent, *[FQIAgent]*5, OnlineGaussianProccessAgent]

RENDER = False
env = SimpleGridworldEnv()

In [3]:
## Configuration Files

# DQN Config
CONFIG_DQN = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "learning_rate": 0.00075,
    "hidden_size": (32,32),
    "target_update_freq": 50,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "plot_loss": False,
    "epsilon": 1,
    "max_deduct": 0.97,
    "decay": 0.25,
    "lr_step_size": 250,
    "lr_gamma": 0.95,
    "max_steps": 50,
    "non_param": False,
}

# Linear Config
CONFIG_LINEAR = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "learning_rate": 0.02,
    "target_update_freq": 20,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e7),
    "plot_loss": False,
    "epsilon": 1,
    "max_steps": 50,
    "poly_degree": 1,
    "max_deduct": 0.97,
    "decay": 0.5,
    "lr_step_size": 250,
    "lr_gamma": 0.99,
    "non_param": False,
}

# Decision Tree Config
CONFIG_DT = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 512,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.4,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"criterion":"mse","max_depth": 15, "min_samples_split": 20, "min_samples_leaf": 5},
    "feature_names": ["Cart Position", "Cart Velocity", "Pole Angle", "Pole Angular Velocity", "Action: Push Left", "Action: Push Right"],
    "plot_name": "dt_depth=8",
}

# Random Forest Config
CONFIG_RF = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 5,
    "batch_size": 512,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.2,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"n_estimators": 5,"max_depth": 15, "min_samples_split": 20, "min_samples_leaf": 5},
}

# Support Vector Regressor Config
CONFIG_SVR = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 256,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"kernel":"rbf", "degree": 2, "C": 3},
}


# K-Neighbors Regressor Config
CONFIG_KNR = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 256,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"n_neighbors":7, "weights": "distance", "algorithm": "auto", "leaf_size": 30},
}

# Gaussian Process Config
CONFIG_GP = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 10,
    "batch_size": 512,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"alpha": 1e-10, "normalize_y": False, "kernel":  RBF(length_scale=0.5, length_scale_bounds="fixed")},
}

# Online Gaussian Process Config
CONFIG_GP_Online = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "batch_size": 32,
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"sigma_0": 0.5, "init":-10, "kernel":  rbf_kernel, "epsilon_tol": 0.085, "basis_limit": 1000},
}

CONFIGS = [CONFIG_DQN, CONFIG_LINEAR, CONFIG_DT, CONFIG_RF, CONFIG_SVR, CONFIG_KNR, CONFIG_GP, CONFIG_GP_Online]
onlines = [False, False, False, False, False, False, False, True]
models = ["Neural Network", "Linear Model", "Decision Tree", "Random Forest", "Support Vectors", "K-Neighbours", "Gaussian Process", "Gaussian Process Online"]

In [6]:
## Performance Evaluation

returns = []
train_returns = []
train_times = []
n_seeds=1

#Running DQN
j=0
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    r, _, t, times = train(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j], 
            render=RENDER,
            online=onlines[j])
    env.close()
    returns.append(r)
    train_returns.append(t)
    train_times.append(times)
    



 Run: 1 

HERE


  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 264 returned a mean returns of -50.0
Epsilon = 0.833936
Learning rate = 0.00075
Evaluation at timestep 524 returned a mean returns of -50.0
Epsilon = 0.6321760000000001
Learning rate = 0.0007125
Evaluation at timestep 764 returned a mean returns of -50.0
Epsilon = 0.428088
Learning rate = 0.000676875
Evaluation at timestep 1005 returned a mean returns of -5.999999999999999
Epsilon = 0.22710400000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1254 returned a mean returns of -5.999999999999999
Epsilon = 0.031552000000000024
Learning rate = 0.0006108796874999999
Evaluation at timestep 1502 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1750 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2005 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 279 returned a mean returns of -50.0
Epsilon = 0.806776
Learning rate = 0.00075
Evaluation at timestep 509 returned a mean returns of -50.0
Epsilon = 0.618208
Learning rate = 0.0007125
Evaluation at timestep 752 returned a mean returns of -50.0
Epsilon = 0.4304159999999999
Learning rate = 0.000676875
Evaluation at timestep 1006 returned a mean returns of -5.999999999999999
Epsilon = 0.22787999999999997
Learning rate = 0.0006430312499999999
Evaluation at timestep 1250 returned a mean returns of -5.999999999999999
Epsilon = 0.03465600000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1505 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1753 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2003 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027


  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 259 returned a mean returns of -50.0
Epsilon = 0.837816
Learning rate = 0.00075
Evaluation at timestep 515 returned a mean returns of -50.0
Epsilon = 0.63916
Learning rate = 0.0007125
Evaluation at timestep 756 returned a mean returns of -50.0
Epsilon = 0.439728
Learning rate = 0.000676875
Evaluation at timestep 1002 returned a mean returns of -5.999999999999999
Epsilon = 0.22865599999999997
Learning rate = 0.0006430312499999999
Evaluation at timestep 1279 returned a mean returns of -5.999999999999999
Epsilon = 0.030776000000000026
Learning rate = 0.0006108796874999999
Evaluation at timestep 1501 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1750 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2001 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning r

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 274 returned a mean returns of -50.0
Epsilon = 0.816088
Learning rate = 0.00075
Evaluation at timestep 501 returned a mean returns of -5.999999999999999
Epsilon = 0.6368320000000001
Learning rate = 0.0007125
Evaluation at timestep 753 returned a mean returns of -5.999999999999999
Epsilon = 0.426536
Learning rate = 0.000676875
Evaluation at timestep 1002 returned a mean returns of -5.999999999999999
Epsilon = 0.22710400000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1254 returned a mean returns of -5.999999999999999
Epsilon = 0.031552000000000024
Learning rate = 0.0006108796874999999
Evaluation at timestep 1504 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1755 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2004 returned a mean returns of -5.999999999999999
Epsi

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 254 returned a mean returns of -50.0
Epsilon = 0.827728
Learning rate = 0.00075
Evaluation at timestep 521 returned a mean returns of -50.0
Epsilon = 0.621312
Learning rate = 0.0007125
Evaluation at timestep 750 returned a mean returns of -5.999999999999999
Epsilon = 0.42265600000000003
Learning rate = 0.000676875
Evaluation at timestep 1007 returned a mean returns of -5.999999999999999
Epsilon = 0.22477599999999998
Learning rate = 0.0006430312499999999
Evaluation at timestep 1255 returned a mean returns of -5.999999999999999
Epsilon = 0.030776000000000026
Learning rate = 0.0006108796874999999
Evaluation at timestep 1500 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1752 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2000 returned a mean returns of -5.999999999999999
Epsilon = 0.0300

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 259 returned a mean returns of -50.0
Epsilon = 0.819192
Learning rate = 0.00075
Evaluation at timestep 518 returned a mean returns of -50.0
Epsilon = 0.6275200000000001
Learning rate = 0.0007125
Evaluation at timestep 762 returned a mean returns of -5.999999999999999
Epsilon = 0.42576
Learning rate = 0.000676875
Evaluation at timestep 1000 returned a mean returns of -5.999999999999999
Epsilon = 0.22865599999999997
Learning rate = 0.0006430312499999999
Evaluation at timestep 1253 returned a mean returns of -5.999999999999999
Epsilon = 0.03232800000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1503 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1754 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2003 returned a mean returns of -5.999999999999999
Epsilon = 0.0300000

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 250 returned a mean returns of -50.0
Epsilon = 0.830056
Learning rate = 0.00075
Evaluation at timestep 503 returned a mean returns of -5.999999999999999
Epsilon = 0.647696
Learning rate = 0.0007125
Evaluation at timestep 776 returned a mean returns of -5.999999999999999
Epsilon = 0.42188000000000003
Learning rate = 0.000676875
Evaluation at timestep 1014 returned a mean returns of -5.999999999999999
Epsilon = 0.24029600000000007
Learning rate = 0.0006430312499999999
Evaluation at timestep 1277 returned a mean returns of -5.999999999999999
Epsilon = 0.047848
Learning rate = 0.0006108796874999999
Evaluation at timestep 1505 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1750 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2000 returned a mean returns of -5.999999999999999
Epsilon = 0.030

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 295 returned a mean returns of -50.0
Epsilon = 0.8098799999999999
Learning rate = 0.0007125
Evaluation at timestep 536 returned a mean returns of -5.999999999999999
Epsilon = 0.61976
Learning rate = 0.000676875
Evaluation at timestep 758 returned a mean returns of -5.999999999999999
Epsilon = 0.42265600000000003
Learning rate = 0.000676875
Evaluation at timestep 1004 returned a mean returns of -5.999999999999999
Epsilon = 0.22710400000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1252 returned a mean returns of -5.999999999999999
Epsilon = 0.03310400000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1500 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1750 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2000 returned a mean returns of -5.9999999

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 260 returned a mean returns of -5.999999999999999
Epsilon = 0.83704
Learning rate = 0.00075
Evaluation at timestep 518 returned a mean returns of -5.999999999999999
Epsilon = 0.6368320000000001
Learning rate = 0.0007125
Evaluation at timestep 755 returned a mean returns of -50.0
Epsilon = 0.41877600000000004
Learning rate = 0.000676875
Evaluation at timestep 1002 returned a mean returns of -5.999999999999999
Epsilon = 0.23098400000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1250 returned a mean returns of -5.999999999999999
Epsilon = 0.03465600000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1500 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1752 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2005 returned a mean returns of -5.99999999999

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 256 returned a mean returns of -50.0
Epsilon = 0.826952
Learning rate = 0.00075
Evaluation at timestep 513 returned a mean returns of -50.0
Epsilon = 0.63916
Learning rate = 0.0007125
Evaluation at timestep 771 returned a mean returns of -50.0
Epsilon = 0.431968
Learning rate = 0.000676875
Evaluation at timestep 1003 returned a mean returns of -5.999999999999999
Epsilon = 0.22632800000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1251 returned a mean returns of -5.999999999999999
Epsilon = 0.03388000000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1500 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1753 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2004 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning ra

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 288 returned a mean returns of -50.0
Epsilon = 0.815312
Learning rate = 0.0007125
Evaluation at timestep 540 returned a mean returns of -50.0
Epsilon = 0.61976
Learning rate = 0.000676875
Evaluation at timestep 757 returned a mean returns of -50.0
Epsilon = 0.446712
Learning rate = 0.000676875
Evaluation at timestep 1006 returned a mean returns of -5.999999999999999
Epsilon = 0.22555199999999997
Learning rate = 0.0006430312499999999
Evaluation at timestep 1253 returned a mean returns of -5.999999999999999
Epsilon = 0.03232800000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1501 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1753 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2003 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learnin

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 275 returned a mean returns of -50.0
Epsilon = 0.8254
Learning rate = 0.00075
Evaluation at timestep 537 returned a mean returns of -50.0
Epsilon = 0.6135520000000001
Learning rate = 0.000676875
Evaluation at timestep 753 returned a mean returns of -50.0
Epsilon = 0.42032800000000003
Learning rate = 0.000676875
Evaluation at timestep 1003 returned a mean returns of -5.999999999999999
Epsilon = 0.22943200000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1257 returned a mean returns of -5.999999999999999
Epsilon = 0.06336799999999998
Learning rate = 0.0006108796874999999
Evaluation at timestep 1505 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1750 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2004 returned a mean returns of -5.999999999999999
Epsilon = 0.03000000

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 268 returned a mean returns of -50.0
Epsilon = 0.830832
Learning rate = 0.00075
Evaluation at timestep 504 returned a mean returns of -50.0
Epsilon = 0.632952
Learning rate = 0.0007125
Evaluation at timestep 759 returned a mean returns of -5.999999999999999
Epsilon = 0.42498400000000003
Learning rate = 0.000676875
Evaluation at timestep 1000 returned a mean returns of -5.999999999999999
Epsilon = 0.22943200000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1255 returned a mean returns of -5.999999999999999
Epsilon = 0.030776000000000026
Learning rate = 0.0006108796874999999
Evaluation at timestep 1500 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1755 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2004 returned a mean returns of -5.999999999999999
Epsilon = 0.0300

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 261 returned a mean returns of -50.0
Epsilon = 0.836264
Learning rate = 0.00075
Evaluation at timestep 508 returned a mean returns of -5.999999999999999
Epsilon = 0.644592
Learning rate = 0.0007125
Evaluation at timestep 769 returned a mean returns of -5.999999999999999
Epsilon = 0.4195519999999999
Learning rate = 0.000676875
Evaluation at timestep 1039 returned a mean returns of -5.999999999999999
Epsilon = 0.23253599999999996
Learning rate = 0.0006108796874999999
Evaluation at timestep 1255 returned a mean returns of -5.999999999999999
Epsilon = 0.030776000000000026
Learning rate = 0.0006108796874999999
Evaluation at timestep 1501 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1755 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2003 returned a mean returns of -5.999999999999999
Epsi

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 288 returned a mean returns of -50.0
Epsilon = 0.81376
Learning rate = 0.0007125
Evaluation at timestep 523 returned a mean returns of -5.999999999999999
Epsilon = 0.625192
Learning rate = 0.0007125
Evaluation at timestep 770 returned a mean returns of -5.999999999999999
Epsilon = 0.42498400000000003
Learning rate = 0.000676875
Evaluation at timestep 1003 returned a mean returns of -5.999999999999999
Epsilon = 0.22632800000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1253 returned a mean returns of -5.999999999999999
Epsilon = 0.03232800000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1503 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1753 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2005 returned a mean returns of -5.999999999999999
Eps

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 277 returned a mean returns of -50.0
Epsilon = 0.82152
Learning rate = 0.00075
Evaluation at timestep 529 returned a mean returns of -5.999999999999999
Epsilon = 0.61976
Learning rate = 0.0007125
Evaluation at timestep 759 returned a mean returns of -5.999999999999999
Epsilon = 0.42576
Learning rate = 0.000676875
Evaluation at timestep 1000 returned a mean returns of -5.999999999999999
Epsilon = 0.23874399999999996
Learning rate = 0.0006430312499999999
Evaluation at timestep 1251 returned a mean returns of -5.999999999999999
Epsilon = 0.03388000000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1504 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1750 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2002 returned a mean returns of -5.999999999999999
Epsilon = 0.030000

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 277 returned a mean returns of -50.0
Epsilon = 0.823848
Learning rate = 0.00075
Evaluation at timestep 508 returned a mean returns of -50.0
Epsilon = 0.6337280000000001
Learning rate = 0.0007125
Evaluation at timestep 752 returned a mean returns of -5.999999999999999
Epsilon = 0.42265600000000003
Learning rate = 0.000676875
Evaluation at timestep 1019 returned a mean returns of -50.0
Epsilon = 0.23641599999999996
Learning rate = 0.0006430312499999999
Evaluation at timestep 1251 returned a mean returns of -5.999999999999999
Epsilon = 0.03388000000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1504 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1756 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2001 returned a mean returns of -5.999999999999999
Epsilon = 0.03000000

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 291 returned a mean returns of -50.0
Epsilon = 0.812984
Learning rate = 0.0007125
Evaluation at timestep 545 returned a mean returns of -50.0
Epsilon = 0.61588
Learning rate = 0.000676875
Evaluation at timestep 788 returned a mean returns of -5.999999999999999
Epsilon = 0.42420800000000003
Learning rate = 0.0006430312499999999
Evaluation at timestep 1001 returned a mean returns of -5.999999999999999
Epsilon = 0.22787999999999997
Learning rate = 0.0006430312499999999
Evaluation at timestep 1252 returned a mean returns of -5.999999999999999
Epsilon = 0.03310400000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1506 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1755 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2003 returned a mean returns of -5.999999999999999
Epsi

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 273 returned a mean returns of -50.0
Epsilon = 0.826952
Learning rate = 0.00075
Evaluation at timestep 511 returned a mean returns of -50.0
Epsilon = 0.6135520000000001
Learning rate = 0.0007125
Evaluation at timestep 756 returned a mean returns of -5.999999999999999
Epsilon = 0.44128
Learning rate = 0.000676875
Evaluation at timestep 1000 returned a mean returns of -5.999999999999999
Epsilon = 0.23175999999999997
Learning rate = 0.0006430312499999999
Evaluation at timestep 1255 returned a mean returns of -5.999999999999999
Epsilon = 0.030776000000000026
Learning rate = 0.0006108796874999999
Evaluation at timestep 1504 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1754 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2004 returned a mean returns of -5.999999999999999
Epsilon = 0.030000

  0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation at timestep 261 returned a mean returns of -50.0
Epsilon = 0.808328
Learning rate = 0.00075
Evaluation at timestep 500 returned a mean returns of -50.0
Epsilon = 0.6508
Learning rate = 0.0007125
Evaluation at timestep 759 returned a mean returns of -5.999999999999999
Epsilon = 0.42110400000000003
Learning rate = 0.000676875
Evaluation at timestep 1009 returned a mean returns of -5.999999999999999
Epsilon = 0.22632800000000008
Learning rate = 0.0006430312499999999
Evaluation at timestep 1253 returned a mean returns of -5.999999999999999
Epsilon = 0.03232800000000002
Learning rate = 0.0006108796874999999
Evaluation at timestep 1502 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005803357031249999
Evaluation at timestep 1750 returned a mean returns of -5.999999999999999
Epsilon = 0.030000000000000027
Learning rate = 0.0005513189179687499
Evaluation at timestep 2000 returned a mean returns of -5.999999999999999
Epsilon = 0.0300000

In [7]:
with open(f'simplegrid_eval_{models[j]}.csv', 'ab') as eval:
    for i in range(n_seeds):
        np.savetxt(eval, [returns[i]], delimiter=',')

In [8]:
with open(f'simplegrid_train_{models[j]}.csv', 'ab') as train:
    for i in range(n_seeds):
        np.savetxt(train, [train_returns[i]], delimiter=',')
        np.savetxt(train, [train_times[i]], delimiter=',')

In [20]:
## Sample Efficiency Evaluation

n_eps = []
n_steps = []
not_solved = []
n_seeds=30

j=7
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    s, e, n = solve(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j],
            target_return=-6,
            op=operator.ge, 
            render=RENDER,
            online=onlines[j])
    env.close()
    n_eps.append(e)
    n_steps.append(s)
    not_solved.append(n)


 Run: 1 

Ep. timesteps: 6
Total timesteps: 762
Total episodes: 22
Evaluation mean return: -5.999999999999999

 Run: 2 

Ep. timesteps: 6
Total timesteps: 728
Total episodes: 19
Evaluation mean return: -5.999999999999999

 Run: 3 

Ep. timesteps: 6
Total timesteps: 282
Total episodes: 7
Evaluation mean return: -5.999999999999999

 Run: 4 

Ep. timesteps: 6
Total timesteps: 425
Total episodes: 10
Evaluation mean return: -5.999999999999999

 Run: 5 

Ep. timesteps: 6
Total timesteps: 282
Total episodes: 10
Evaluation mean return: -5.999999999999999

 Run: 6 

Ep. timesteps: 6
Total timesteps: 710
Total episodes: 18
Evaluation mean return: -5.999999999999999

 Run: 7 

Ep. timesteps: 6
Total timesteps: 1075
Total episodes: 25
Evaluation mean return: -5.999999999999999

 Run: 8 

Ep. timesteps: 6
Total timesteps: 521
Total episodes: 19
Evaluation mean return: -5.999999999999999

 Run: 9 

Ep. timesteps: 6
Total timesteps: 412
Total episodes: 12
Evaluation mean return: -5.999999999999999



In [21]:
with open(f'simplegrid_se-6_{models[j]}.csv', 'ab') as se:
    np.savetxt(se, [n_eps], delimiter=',')
    np.savetxt(se, [n_steps], delimiter=',')
    np.savetxt(se, [not_solved], delimiter=',')

In [22]:
mean_eps = np.mean(n_eps)
std_eps = np.std(n_eps)
print(f"Average n_eps: {mean_eps}")
print(f"Std n_eps: {std_eps}")
print(f"St.error n_eps: {std_eps/np.sqrt(n_seeds)}")

mean_steps = np.mean(n_steps)
std_steps = np.std(n_steps)
print(f"Average n_steps: {mean_steps}0")
print(f"Std n_steps: {std_steps}")
print(f"St.error n_steps: {std_steps/np.sqrt(n_seeds)}")

print(f"Not solved: {np.sum(not_solved)} runs")

Average n_eps: 21.633333333333333
Std n_eps: 27.45721924902245
St.error n_eps: 5.0129794496848845
Average n_steps: 900.60
Std n_steps: 1383.9435826651315
St.error n_steps: 252.67237284673604
Not solved: 3 runs


In [23]:
## Training time

times = []
for j in range(8):
        time = train_time(env, 
                CONFIGS[j], 
                fa=function_approximators[j], 
                agent = agents[j],
                online=onlines[j])
        env.close()
        times.append(time)

print(time)

5005it [00:06, 732.22it/s]                          
  3%|▎         | 173/5000 [00:00<00:02, 1637.78it/s]-6
5006it [00:03, 1297.35it/s]                          
 11%|█▏        | 566/5000 [00:00<00:01, 3382.90it/s]-8
100%|██████████| 5000/5000 [00:37<00:00, 134.49it/s]
 11%|█         | 549/5000 [00:00<00:01, 3684.31it/s]-6
5007it [02:02, 40.97it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]-8
5001it [03:33, 23.42it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]-6
5003it [09:31,  8.75it/s]
 11%|█         | 531/5000 [00:00<00:01, 2983.59it/s]-6
100%|██████████| 5000/5000 [04:40<00:00, 17.83it/s]
  1%|          | 28/5000 [00:00<00:20, 246.46it/s]-6
5004it [00:56, 89.35it/s]                          -6
56.0056312084198



In [24]:
with open(f'simplegrid_times.csv', 'ab') as t:
    np.savetxt(t, [times], delimiter=',')