# Tune Continuous DQN

In [1]:
import os
import numpy as np
import ray
from ray import tune

# Import the environment. Its module lives outside this notebook's folder, so
# the working directory is switched to the project folder before importing.
# The folder can be overridden with the FIRMS_PRICING_DIR environment variable;
# when that variable is unset, the original hard-coded location is used.
path = os.environ.get('FIRMS_PRICING_DIR',
                      '/home/lorenzo/Desktop/FirmsPricing_ContObs')
os.chdir(path)
from MA_Firms_Pricing_ContObs import MultiAgentFirmsPricingContinuous

W0803 12:29:22.175943 139735645222720 deprecation.py:323] From /home/lorenzo/anaconda3/envs/tf-rllib-2/lib/python3.6/site-packages/tensorflow/python/compat/v2_compat.py:61: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term


Environment configuration, per-agent policy definitions, and the agent-to-policy mapping

In [2]:
# Environment settings; the same dict is passed to the Tune run as "env_config".
ENV_CONFIG = dict(
    num_agents=2,
    max_steps=10**9,
    p_min=1.2,
    p_max=2,
)

# Local environment instance; the policy definitions below read its
# observation/action spaces and its agent count (env.num).
env = MultiAgentFirmsPricingContinuous(env_config=ENV_CONFIG)

# Per-agent policy specification
def gen_policy():
    """Build the policy specification tuple for a single agent.

    Returns:
        A 4-tuple ``(None, observation_space, action_space, {})`` whose
        spaces are taken from the module-level ``env`` instance.

    NOTE(review): the leading ``None`` presumably tells RLlib to use the
    trainer's default policy class -- confirm against the installed version.
    """
    policy_spec = (None, env.observation_space, env.action_space, {})
    return policy_spec

# One independent policy per agent, keyed 'agent_0', 'agent_1', ...
policy_graphs = {
    'agent_' + str(agent_index): gen_policy()
    for agent_index in range(env.num)
}

# Agent -> policy mapping
def policy_mapping_fn(agent_id):
    """Return the policy id for ``agent_id``.

    The mapping is the identity: the returned policy id is the agent id
    itself, unchanged.
    """
    policy_id = agent_id
    return policy_id

In [3]:
# Callbacks that collect custom per-episode metrics
def on_episode_start(info):
    """Reset the per-step metric buffers at the start of an episode.

    Args:
        info: dict whose ``"episode"`` entry exposes a ``user_data`` mapping.

    Side effects:
        Sets ``user_data["delta0"]``, ``["delta1"]``, ``["price0"]`` and
        ``["price1"]`` each to a fresh empty list.
    """
    buffers = info["episode"].user_data
    for metric_name in ("delta0", "delta1", "price0", "price1"):
        buffers[metric_name] = []

def on_episode_step(info):
    """Record one sample of each custom metric for the current step.

    Args:
        info: dict whose ``"episode"`` entry provides ``prev_reward_for``,
            ``last_raw_obs_for`` and the ``user_data`` buffers created in
            ``on_episode_start``.

    Side effects:
        Appends one value to each of ``user_data["delta0"]``, ``["delta1"]``,
        ``["price0"]`` and ``["price1"]``.
    """
    episode = info["episode"]

    # Rewards are rescaled as (reward - lower) / (upper - lower).
    # NOTE(review): 0.22589 and 0.337472 look like reference profit levels
    # (e.g. competitive vs. collusive) -- confirm their origin.
    lower = 0.22589
    spread = 0.337472 - 0.22589

    # NOTE(review): both prices are read from agent_0's raw observation
    # (indices 0 and 1); presumably each observation holds both firms'
    # prices -- confirm against the environment.
    samples = {
        "delta0": (episode.prev_reward_for(agent_id='agent_0') - lower) / spread,
        "delta1": (episode.prev_reward_for(agent_id='agent_1') - lower) / spread,
        "price0": episode.last_raw_obs_for(agent_id='agent_0')[0],
        "price1": episode.last_raw_obs_for(agent_id='agent_0')[1],
    }
    for metric_name, value in samples.items():
        episode.user_data[metric_name].append(value)

def on_episode_end(info):
    """Publish the episode averages of the buffered metrics.

    Args:
        info: dict whose ``"episode"`` entry exposes the ``user_data``
            buffers and a ``custom_metrics`` mapping.

    Side effects:
        Stores ``np.mean`` of each buffer under the same key
        (``"delta0"``, ``"delta1"``, ``"price0"``, ``"price1"``) in
        ``episode.custom_metrics``.
    """
    episode = info["episode"]
    metric_names = ("delta0", "delta1", "price0", "price1")

    # Compute every average first, then publish them.
    averages = [np.mean(episode.user_data[name]) for name in metric_names]
    for name, average in zip(metric_names, averages):
        episode.custom_metrics[name] = average

### Experiment

In [None]:
# Start Ray, then launch a single Tune experiment that trains the 'APEX'
# trainer on the multi-agent pricing environment. Results are logged under
# the experiment name '20_cont_DQN'.
ray.init()
trial = tune.run(
        run_or_experiment= 'APEX',
        name='20_cont_DQN',
        # Stopping criterion handed to Tune: timesteps_total of 10**8.
        stop={"timesteps_total":10**8},
        checkpoint_freq=50,
        #resume=False,
        #num_samples = 2,
        config={
            # Environment class plus the same settings used for the local
            # `env` instance created earlier in the notebook.
            "env": MultiAgentFirmsPricingContinuous,
            "env_config": ENV_CONFIG,
            "horizon": 100,
            "soft_horizon": True,
            # DQN-family options.
            # NOTE(review): both "hiddens" here and "fcnet_hiddens" under
            # "model" are set -- confirm how the installed RLlib version
            # combines them.
            "double_q": True,
            "dueling": True,
            "hiddens": [16],
            "n_step": 3,
            "num_atoms": 10,
            #"noisy": True,
            #"sigma0": 0.5,
            "gamma": 0.975,
            # Replay, optimiser and exploration settings.
            "prioritized_replay": True,
            "prioritized_replay_alpha": 0.5,
            "beta_annealing_fraction": 0.2,
            "final_prioritized_replay_beta": 1.0,
            "learning_starts": 20000,
            "lr":0.0005,
            "adam_epsilon": 0.00015,
            "schedule_max_timesteps": 10**7,
            "exploration_final_eps":0.02,
            "exploration_fraction":0.02,
            "buffer_size": 10**5,
            "target_network_update_freq": 50000,
            "sample_batch_size":16,
            "train_batch_size":64,
            # Observation filtering and rollout / resource settings.
            "observation_filter": "MeanStdFilter",
            "num_workers": 2,
            "num_envs_per_worker": 8,
            "num_cpus_per_worker": 2,
            #"num_cpus_for_driver": 1,
            "num_gpus":0,
            # One policy per agent (see `policy_graphs`), selected by the
            # identity mapping in `policy_mapping_fn`.
            # NOTE(review): the "policy_graphs" key name is version-specific;
            # later RLlib releases call it "policies" -- confirm before
            # upgrading Ray.
            "multiagent": {
                    "policy_graphs": policy_graphs,
                    "policy_mapping_fn": tune.function(policy_mapping_fn)
            },
            "model": {
                    "fcnet_activation": "tanh",
                    "fcnet_hiddens":[24, 24],
                    },
            # Hooks that accumulate and publish the custom delta/price metrics.
            "callbacks": {
                    "on_episode_start": tune.function(on_episode_start),
                    "on_episode_step": tune.function(on_episode_step),
                    "on_episode_end": tune.function(on_episode_end),
                    },
            },
    )

2019-08-03 12:31:02,766	INFO node.py:498 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-08-03_12-31-02_766596_4828/logs.
2019-08-03 12:31:02,881	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:52638 to respond...
2019-08-03 12:31:02,999	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:20462 to respond...
2019-08-03 12:31:03,001	INFO services.py:809 -- Starting Redis shard with 2.05 GB max memory.
2019-08-03 12:31:03,017	INFO node.py:512 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-08-03_12-31-02_766596_4828/logs.
2019-08-03 12:31:03,019	INFO services.py:1475 -- Starting the Plasma object store with 3.07 GB memory using /dev/shm.
2019-08-03 12:31:03,117	INFO trial_runner.py:176 -- Starting a new experiment.
W0803 12:31:03.175817 139735645222720 deprecation_wrapper.py:119] From /home/lorenzo/anaconda3/envs/tf-rllib-2/lib/python3.6/site-packages/ray/tune/logger.py:134: The name tf.VERSION is deprecated. Ple

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/6 CPUs, 0/0 GPUs
Memory usage on this node: 2.3/10.2 GB





== Status ==
Using FIFO scheduling algorithm.
Resources requested: 5/6 CPUs, 0/0 GPUs
Memory usage on this node: 2.3/10.2 GB
Result logdir: /home/lorenzo/ray_results/20_cont_DQN
Number of trials: 1 ({'RUNNING': 1})
RUNNING trials:
 - APEX_MultiAgentFirmsPricingContinuous_0:	RUNNING

[2m[36m(pid=4909)[0m W0803 12:31:05.163333 139903247939392 deprecation.py:323] From /home/lorenzo/anaconda3/envs/tf-rllib-2/lib/python3.6/site-packages/tensorflow/python/compat/v2_compat.py:61: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
[2m[36m(pid=4909)[0m Instructions for updating:
[2m[36m(pid=4909)[0m non-resource variables are not supported in the long term
[2m[36m(pid=4909)[0m 2019-08-03 12:31:05,461	INFO rollout_worker.py:310 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=4909)[0m 2019-08-03 12:31:05.462272: I tensorflow/core/platform/cpu_feature_guard.cc:14

W0803 12:31:41.077734 139735645222720 deprecation_wrapper.py:119] From /home/lorenzo/anaconda3/envs/tf-rllib-2/lib/python3.6/site-packages/ray/tune/logger.py:117: The name tf.Summary is deprecated. Please use tf.compat.v1.Summary instead.



Result for APEX_MultiAgentFirmsPricingContinuous_0:
  custom_metrics:
    delta0_max: 1.8191215035274917
    delta0_mean: 1.7617924532577973
    delta0_min: 1.6905625984705683
    delta1_max: -0.32425386503906256
    delta1_mean: -0.5414074408759946
    delta1_min: -0.7103533419550015
    price0_max: 1.6904187543732507
    price0_mean: 1.6458039342856605
    price0_min: 1.6082153076269492
    price1_max: 1.9980574176579369
    price1_mean: 1.9900223016262248
    price1_min: 1.976213030412835
  date: 2019-08-03_12-31-41
  done: false
  episode_len_mean: 100.0
  episode_reward_max: 64.33840156224525
  episode_reward_mean: 59.412821892517314
  episode_reward_min: 55.34838779358367
  episodes_this_iter: 200
  episodes_total: 200
  experiment_id: 95996b9e37e94167993e131b44c649e9
  hostname: lorenzo-VirtualBox
  info:
    learner:
      agent_0:
        cur_lr: 0.0005000000237487257
        mean_td_error: 2.1844780445098877
        model: {}
      agent_1:
        cur_lr: 0.00050000002374872