In [None]:
# default_exp car

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# ensures that the core library is automatically reloaded whenever it is modified
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Car RL

In [None]:
from bfh_mt_hs2020_rl_basics.env import CarEnv
from bfh_mt_hs2020_rl_basics.agent import SimpleAgent, RainbowAgent
from bfh_mt_hs2020_rl_basics.bridge import SimpleBridge, RainbowBridge
from bfh_mt_hs2020_rl_basics.loop import LoopControl

import gym
from gym.spaces import Tuple, Discrete, Box
import numpy as np
import warnings

from types import SimpleNamespace

import torch
from torch.optim import Optimizer, Adam

In [None]:
HYPERPARAMS = {
    'base_setup': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep
        
            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 1000,     # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 10**5,    # simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
        
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 1000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1

            # loop control  
            'loop_bound_avg_reward'       : 0.0,   # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep
      
            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 10**6,    # simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1

            # loop control  
            'loop_bound_avg_reward'       : 50.0,   # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps_2_cuda': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cuda",   # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,    # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1

            # loop control  
            'loop_bound_avg_reward'       : 50.0,   # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),

  'buffer_eps_2_cpu': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",   # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,    # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps_lr_cpu': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep
      
            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.00005,  # *learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
  
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps_2_cpu_limit': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : True,     # *are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",   # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,    # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'r_buffer_eps': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep
      
            # agent
            'agent_type'                  : "r",      # *agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # *simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 3,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1

            # loop control  
            'loop_bound_avg_reward'       : 50.0,   # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    })    ,
  'r_buffer_eps_limit': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : True,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep
      
            # agent
            'agent_type'                  : "r",      # *agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # *simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 3,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1

            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps_2_cpu_gamma': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.99,     # *discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps_2_cpu_gamma_time': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : True,     # *Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.99,     # *discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps_2_cpu_gamma_time_lr': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : True,     # *Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.00005,  # *learningrate
            'bridge_gamma'                : 0.99,     # *discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : -50.0,    # *target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
  'buffer_eps_2_cpu_time_buffer': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : True,     # *Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 100000,   # *size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.99,     # *discount_factor for reward
            'bridge_initial_population'   : 20000,    # *initial number of experiences in buffer
            'bridge_batch_size'           : 48,       # *batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : -50.0,    # *target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }), 
  'buffer_eps_2_cpu_time_buffer_lrbg': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : True,     # *Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 100000,   # *size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.001,    # *learningrate
            'bridge_gamma'                : 0.99,     # *discount_factor for reward
            'bridge_initial_population'   : 20000,    # *initial number of experiences in buffer
            'bridge_batch_size'           : 48,       # *batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : -50.0,    # *target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),     
    
    # from here on: with the timestep in the state
      'buffer_eps_2_cpu_gamma_time_state': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : True,     # *Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",    # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,  # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.99,     # *discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
    'buffer_eps_2_cpu_timestate': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",   # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,    # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
    'buffer_eps_2_cpu_timestate_newHiddenL': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",   # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 50000,    # size of replay buffer
            'agent_target_net_sync'       : 1000,     # sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 5*10**5,    # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.9,      # discount_factor for reward
            'bridge_initial_population'   : 5000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
    
    'buffer_eps_2_cpu_timestate_newHiddenL2': SimpleNamespace(**{
            # env
            'env_mode_energy_penalty'     : False,    # should there be a -1 point penalty for a used energy unit
            'env_mode_random'             : False,    # does acceleration and decelartion have a random part
            'env_mode_limit_steps'        : False,    # are the maximum possible steps limited
            'env_mode_time_penalty'       : False,    # Penalty for every timestep

            # agent
            'agent_type'                  : "s",      # agent type: s=simple, r=rainbow
            'agent_device'                : "cpu",   # * cpu or cuda
            'agent_gamma_exp'             : 0.9,      # discount_factor for experience_first_last.. shouldn't matter since step_size is only 1
            'agent_buffer_size'           : 100000,    # *size of replay buffer
            'agent_target_net_sync'       : 1000,     # *sync TargetNet with weights of DNN every .. iterations
            'agent_simple_eps_start'      : 1.0,      # simpleagent: epsilon start
            'agent_simple_eps_final'      : 0.02,     # simpleagent: epsilon end
            'agent_simple_eps_frames'     : 8*10**5,    # * simpleagent: epsilon frames -> how many frames until 0.02 should be reached .. decay is linear
            'agent_rain_steps_count'      : 1,        # rainbowagent: steps per iteration
            'agent_rain_prio_replay_alpha': 0.6,      # rainbowagent: prio buffer alpha  
  
            # bridge  
            'bridge_optimizer'            : None,     # Optimizer -> default ist Adam
            'bridge_learning_rate'        : 0.0001,   # learningrate
            'bridge_gamma'                : 0.99,      # discount_factor for reward
            'bridge_initial_population'   : 10000,     # initial number of experiences in buffer
            'bridge_batch_size'           : 32,       # batch_size for training
            'bridge_rain_beta_start'      : 0.4,      # rainbow: start beta for Gewichtung buffer
            'bridge_rain_beta_frames'     : 100000,   # rainbow: iteration when bea reaches 1
    
            # loop control  
            'loop_bound_avg_reward'       : 50.0,     # target avg reward
            'loop_logtb'                  : True,     # Log to Tensorboard Logfile
    }),
    
}
    

In [None]:
def create_control(params: SimpleNamespace, config_name) -> LoopControl:
    """Assemble environment, agent, bridge and loop control for one configuration.

    Args:
        params: One hyperparameter set (an entry of ``HYPERPARAMS``). The
            ``env_*`` attributes configure the ``CarEnv``, the ``agent_*``
            attributes the agent, the ``bridge_*`` attributes the bridge and
            the ``loop_*`` attributes the ``LoopControl``.
            ``params.agent_type`` selects the implementation: ``"s"`` builds
            ``SimpleAgent``/``SimpleBridge``, ``"r"`` builds
            ``RainbowAgent``/``RainbowBridge``.
        config_name: Passed to ``LoopControl`` as ``run_name``.

    Returns:
        The ``LoopControl`` wrapping the freshly built bridge.

    Raises:
        ValueError: If ``params.agent_type`` is neither ``"s"`` nor ``"r"``.
    """
    # Fail early with a clear message. Without this check an unknown agent type
    # would skip both branches below and crash later with an UnboundLocalError
    # on `bridge`, after the environment has already been built.
    if params.agent_type not in ("s", "r"):
        raise ValueError(
            f"unknown agent_type {params.agent_type!r} for config {config_name!r}; "
            "expected 's' (simple) or 'r' (rainbow)"
        )

    env = CarEnv(mode_energy_penalty   = params.env_mode_energy_penalty,
                 mode_random           = params.env_mode_random,
                 mode_limit_steps      = params.env_mode_limit_steps,
                 mode_time_penalty     = params.env_mode_time_penalty)

    if params.agent_type == "s":  # simple agent
        agent = SimpleAgent(env,
                            devicestr       = params.agent_device,
                            gamma           = params.agent_gamma_exp,
                            buffer_size     = params.agent_buffer_size,
                            target_net_sync = params.agent_target_net_sync,
                            eps_start       = params.agent_simple_eps_start,
                            eps_final       = params.agent_simple_eps_final,
                            eps_frames      = params.agent_simple_eps_frames,
                           )

        bridge = SimpleBridge(agent=agent,
                            optimizer          = params.bridge_optimizer,
                            learning_rate      = params.bridge_learning_rate,
                            gamma              = params.bridge_gamma,
                            initial_population = params.bridge_initial_population,
                            batch_size         = params.bridge_batch_size,
                           )
    else:  # "r": rainbow agent (the only other value accepted above)
        agent = RainbowAgent(env,
                            devicestr          = params.agent_device,
                            gamma              = params.agent_gamma_exp,
                            buffer_size        = params.agent_buffer_size,
                            target_net_sync    = params.agent_target_net_sync,
                            steps_count        = params.agent_rain_steps_count,
                            prio_replay_alpha  = params.agent_rain_prio_replay_alpha
                           )

        bridge = RainbowBridge(agent=agent,
                            optimizer          = params.bridge_optimizer,
                            learning_rate      = params.bridge_learning_rate,
                            gamma              = params.bridge_gamma,
                            initial_population = params.bridge_initial_population,
                            batch_size         = params.bridge_batch_size,
                            beta_start         = params.bridge_rain_beta_start,
                            beta_frames        = params.bridge_rain_beta_frames
                           )

    control = LoopControl(
                   bridge              = bridge,
                   run_name            = config_name,
                   bound_avg_reward    = params.loop_bound_avg_reward,
                   logtb               = params.loop_logtb)

    return control

In [None]:
def run_example(config_name: str):
    """Build the training loop for one named configuration and run it.

    Args:
        config_name: Key into ``HYPERPARAMS``; also handed to
            ``create_control`` as the name of the run.

    Raises:
        KeyError: If ``config_name`` is not a key of ``HYPERPARAMS``.
    """
    try:
        params = HYPERPARAMS[config_name]
    except KeyError:
        # Same exception type as the plain lookup, but say what would have worked.
        raise KeyError(
            f"unknown config {config_name!r}; available: {sorted(HYPERPARAMS)}"
        ) from None

    # Get rid of the missing-metrics warning, but only while this run is active:
    # catch_warnings() restores the previous filters on exit, so other cells of
    # the notebook keep seeing their UserWarnings.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        control = create_control(params, config_name)
        control.run()

In [None]:
# Experiment log: every run except the last one is commented out; the trailing
# notes record how the earlier runs went.
# run_example('base_setup')
# run_example('buffer_eps')
# run_example('buffer_eps_2_cuda')
# run_example('buffer_eps_2_cpu')
# run_example('buffer_eps_lr_cpu')
# run_example('buffer_eps_2_cpu_limit')          # aborted ... kept oscillating back and forth
# run_example('r_buffer_eps')                    # aborted .. spent quite a long time spinning in one episode
# run_example('r_buffer_eps_limit')              # aborted
# run_example('buffer_eps_2_cpu_gamma')          # aborted ..
# run_example('buffer_eps_2_cpu_gamma_time')     # aborted .. after approx. 40 minutes
# run_example('buffer_eps_2_cpu_gamma_time_lr')  # aborted; caution: the graph has time gaps
# run_example('buffer_eps_2_cpu_time_buffer')    # buffer size doubled, initial population 20'000, batch size 48
#                                                # -> idea: more balanced -> aborted: good at the start and more stable
# run_example('buffer_eps_2_cpu_time_buffer_lrbg')  # aborted

# ------------------ timestep included in the state
# run_example('buffer_eps_2_cpu_gamma_time_state')  # aborted
# run_example('buffer_eps_2_cpu_timestate')         # aborted, does not get close to the result of buffer_eps_2_cpu

# ------------------ network size adjusted: 2 layers of 4 neurons each
# run_example('buffer_eps_2_cpu_timestate_newHiddenL')

# ------------------ network size adjusted: 2 layers of 16 neurons each
run_example(config_name='buffer_eps_2_cpu_timestate_newHiddenL2')

Episode 1: reward=-1027, steps=172, elapsed=0:00:02
Episode 2: reward=-1046, steps=437, elapsed=0:00:02
Episode 3: reward=-1028, steps=223, elapsed=0:00:02
Episode 4: reward=-1084, steps=327, elapsed=0:00:02
Episode 5: reward=-1002, steps=114, elapsed=0:00:02
Episode 6: reward=-1080, steps=464, elapsed=0:00:02
Episode 7: reward=-1048, steps=338, elapsed=0:00:02
Episode 8: reward=-1064, steps=361, elapsed=0:00:02
Episode 9: reward=-1028, steps=333, elapsed=0:00:02
Episode 10: reward=-1031, steps=213, elapsed=0:00:02
Episode 11: reward=-1026, steps=230, elapsed=0:00:02
Episode 12: reward=-1034, steps=306, elapsed=0:00:02
Episode 13: reward=-1068, steps=443, elapsed=0:00:02
Episode 14: reward=-1098, steps=418, elapsed=0:00:02
Episode 15: reward=-1088, steps=418, elapsed=0:00:02
Episode 16: reward=-1055, steps=294, elapsed=0:00:02
Episode 17: reward=-1104, steps=466, elapsed=0:00:02
Episode 18: reward=-1043, steps=248, elapsed=0:00:02
Episode 19: reward=-1115, steps=418, elapsed=0:00:02
Ep