In [1]:
import numpy as np
import gym
import pandas as pd

import ray
from ray import tune
from ray.rllib.agents.dqn import DQNTrainer

from ray.tune.registry import register_env
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.bayesopt import BayesOptSearch


In [2]:
class PickLargest(gym.Env):
    """Single-step environment: see four random numbers, pick one, receive it.

    Each episode draws a 4-vector from a standard normal distribution. The
    agent chooses an index in ``{0, 1, 2, 3}`` and is rewarded with the value
    stored at that index. The episode terminates after that one step.
    """

    def __init__(self):
        # reset() returns float64 arrays (np.random.randn), so the space is
        # declared as float64 as well. Box defaults to float32, and a float64
        # observation is rejected by observation_space.contains() on gym
        # versions that compare dtypes.
        self.observation_space = gym.spaces.Box(
            low=float("-inf"), high=float("inf"), shape=(4, ),
            dtype=np.float64)
        self.action_space = gym.spaces.Discrete(4)
        # No observation exists until reset() has been called.
        self.obs = None

    def reset(self, **kwargs):
        """Start a new episode and return its observation.

        Keyword arguments are accepted and ignored. The draw uses NumPy's
        global random state, so reproducibility depends on that state being
        seeded by the caller.
        """
        self.obs = np.random.randn(4)
        return self.obs

    def step(self, action):
        """Reward the agent with the observed value at index ``action``.

        Returns:
            ``(obs, reward, done, info)`` where ``obs`` is the unchanged
            observation, ``done`` is always ``True`` and ``info`` is empty.

        Raises:
            RuntimeError: if called before ``reset()``.
        """
        if self.obs is None:
            raise RuntimeError("PickLargest.step() called before reset()")
        reward = self.obs[action]
        return self.obs, reward, True, {}

def env_creator(env_config):
    """Factory passed to ``register_env``; builds a new ``PickLargest``.

    ``env_config`` is accepted to satisfy the creator signature but is not
    used.
    """
    fresh_env = PickLargest()
    return fresh_env

In [3]:
# Train four DQN agents -- trials 0 and 1 with seed 666, trials 2 and 3 with
# seed 999 -- and record the max/min episode reward reported by each of three
# training iterations. Trials that share a seed are expected to produce
# identical trajectories.
trajs = list()
for trial in range(4):
    seed = 666 if trial in [0, 1] else 999

    ray.init(ignore_reinit_error=True)
    try:
        register_env("PickLargest", env_creator)
        agent = DQNTrainer(
            env="PickLargest",
            config={"seed": seed})

        # Flat list: [max_1, min_1, max_2, min_2, max_3, min_3].
        trajectory = list()
        for _ in range(3):
            r = agent.train()
            trajectory.append(r["episode_reward_max"])
            trajectory.append(r["episode_reward_min"])
        trajs.append(trajectory)
    finally:
        # Always tear the Ray runtime down, even if training raised. Otherwise
        # ignore_reinit_error=True would silently reuse the stale runtime the
        # next time this cell runs.
        ray.shutdown()

2021-11-20 14:47:46,598	INFO trainer.py:753 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2021-11-20 14:47:46,599	INFO dqn.py:143 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2021-11-20 14:47:46,600	INFO trainer.py:772 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


In [4]:
# One line per trial: the recorded max/min reward sequence.
for reward_trace in trajs:
    print(reward_trace)

[2.7971520991646552, -2.857962229200296, 3.54639274942917, -2.7120852012876107, 2.9354710995698308, -3.427965628826908]
[2.7971520991646552, -2.857962229200296, 3.54639274942917, -2.7120852012876107, 2.9354710995698308, -3.427965628826908]
[3.1591653784582365, -3.354582245221421, 3.3737197863259074, -3.3048407485994082, 2.941864688280453, -3.5399449458396797]
[3.1591653784582365, -3.354582245221421, 3.3737197863259074, -3.3048407485994082, 2.941864688280453, -3.5399449458396797]


In [5]:
# For each of four trials (seed 666 for trials 0-1, seed 999 for trials 2-3):
# train a DQN agent for two iterations, then play one episode of PickLargest
# with it and record the actions taken and rewards received.
trajs = list()
trajs_actions = list()
trajs_rewards = []
for trial in range(4):
    seed = 666 if trial in [0, 1] else 999

    ray.init(ignore_reinit_error=True)
    try:
        register_env("PickLargest", env_creator)
        agent = DQNTrainer(
            env="PickLargest",
            config={"seed": seed})

        # Flat list: [max_1, min_1, max_2, min_2].
        trajectory = list()
        for _ in range(2):
            r = agent.train()
            trajectory.append(r["episode_reward_max"])
            trajectory.append(r["episode_reward_min"])
        trajs.append(trajectory)

        # Roll out one episode. PickLargest.step always returns done=True,
        # so this loop body runs exactly once per trial.
        # NOTE(review): compute_single_action is called with its default
        # exploration setting -- confirm that is what evaluation should use.
        actions = []
        rewards = []
        done = False
        env = PickLargest()
        obs = env.reset()
        while not done:
            action = agent.compute_single_action(obs)
            actions.append(action)
            obs, reward, done, info = env.step(action)
            rewards.append(reward)
        trajs_actions.append(actions)
        trajs_rewards.append(rewards)
    finally:
        # Always tear the Ray runtime down, even if training or the rollout
        # raised. Otherwise ignore_reinit_error=True would silently reuse the
        # stale runtime on the next run.
        ray.shutdown()



In [6]:
# Per-trial actions first, then the matching per-trial rewards.
for per_trial_log in (trajs_actions, trajs_rewards):
    print(per_trial_log)

[[3], [3], [1], [1]]
[[0.41575593666031874], [0.41575593666031874], [1.5788194892097913], [1.5788194892097913]]


In [7]:
# For each of four trials (seed 666 for trials 0-1, seed 999 for trials 2-3):
# train DQN through tune.run, rebuild an agent from the checkpoint Tune wrote,
# then play one episode of PickLargest and record actions and rewards.
trajs_actions = list()
trajs_rewards = []

for trial in range(4):
    seed = 666 if trial in [0, 1] else 999

    ray.init(ignore_reinit_error=True)
    try:
        register_env("PickLargest", env_creator)

        # Wrap training in tune.run. metric/mode are set here so that
        # analysis.best_config and analysis.best_checkpoint can be resolved
        # below without extra arguments.
        analysis = tune.run(
            run_or_experiment=DQNTrainer,
            stop={'timesteps_total': 3},
            config={
                'env': "PickLargest",
                "seed": seed,
            },
            metric="episode_reward_mean",
            mode='max',
            checkpoint_at_end=True,
            verbose=1,
        )

        # Recreate the trainer with the winning config and load its weights.
        agent = DQNTrainer(config=analysis.best_config, env="PickLargest")
        agent.restore(analysis.best_checkpoint)

        # Roll out one episode. PickLargest.step always returns done=True,
        # so this loop body runs exactly once per trial.
        actions = []
        rewards = []
        done = False
        env = PickLargest()
        obs = env.reset()
        while not done:
            action = agent.compute_single_action(obs)
            actions.append(action)
            obs, reward, done, info = env.step(action)
            rewards.append(reward)
        trajs_actions.append(actions)
        trajs_rewards.append(rewards)
    finally:
        # Always tear the Ray runtime down, even if tuning, restoring or the
        # rollout raised. Otherwise ignore_reinit_error=True would silently
        # reuse the stale runtime on the next run.
        ray.shutdown()
    

[2m[36m(pid=491958)[0m [2021-11-20 14:50:16,030 E 491958 492064] raylet_client.cc:159: IOError: Broken pipe [RayletClient] Failed to disconnect from raylet.
2021-11-20 14:50:16,133	INFO tune.py:630 -- Total run time: 8.61 seconds (8.22 seconds for the tuning loop).
2021-11-20 14:50:17,712	INFO trainable.py:417 -- Restored on 10.1.0.4 from checkpoint: /home/jing/ray_results/DQN_2021-11-20_14-50-07/DQN_PickLargest_2744e_00000_0_2021-11-20_14-50-07/checkpoint_000001/checkpoint-1
2021-11-20 14:50:17,713	INFO trainable.py:424 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': 0, '_time_total': 2.39831805229187, '_episodes_total': 1000}


In [8]:
# Per-trial actions first, then the matching per-trial rewards.
for per_trial_log in (trajs_actions, trajs_rewards):
    print(per_trial_log)

[[3], [3], [0], [0]]
[[0.7990245823262375], [0.7990245823262375], [0.43561831700300296], [0.43561831700300296]]
