# Reinforcement Learning Final Project

In [1]:
# Print the public names exported by the `flow.networks` package.
from flow import networks

print(networks.__all__)

['Network', 'BayBridgeNetwork', 'BayBridgeTollNetwork', 'BottleneckNetwork', 'FigureEightNetwork', 'TrafficLightGridNetwork', 'HighwayNetwork', 'RingNetwork', 'MergeNetwork', 'MultiRingNetwork', 'MiniCityNetwork', 'HighwayRampsNetwork', 'I210SubNetwork']


In [2]:
from flow.networks import RingNetwork

# ring road network class
# NOTE: despite its name, `network_name` holds the class object itself (not a
# string); it is later passed as the `network` entry of `flow_params`.
network_name = RingNetwork

In [3]:
# input parameter classes to the network class
from flow.core.params import InitialConfig, NetParams
# default network-specific parameters shipped with the ring network module
from flow.networks.ring import ADDITIONAL_NET_PARAMS

# name of the network (also used as the experiment tag in `flow_params`)
name = "training_example"

# network-specific parameters
# Pass a copy rather than the module-level default itself, so that nothing
# holding on to `net_params` can modify the defaults shared by every other
# importer of `flow.networks.ring`.
net_params = NetParams(additional_params=ADDITIONAL_NET_PARAMS.copy())

# initial configuration to vehicles
initial_config = InitialConfig(spacing="uniform", perturbation=1)

### Adding Trainable Autonomous Vehicles

In [4]:
# container describing the vehicles placed in the network
from flow.core.params import VehicleParams
# acceleration and routing models used by the human-driven vehicles
from flow.controllers import ContinuousRouter, IDMController

vehicles = VehicleParams()

# 21 vehicles of type "human", driven by IDMController and routed by
# ContinuousRouter (both with their default arguments)
vehicles.add(
    veh_id="human",
    acceleration_controller=(IDMController, {}),
    routing_controller=(ContinuousRouter, {}),
    num_vehicles=21,
)

The above addition to the `VehicleParams` object only accounts for 21 of the 22 vehicles that are placed in the network. We now add one more vehicle: a trainable autonomous vehicle whose actions are dictated by an RL agent. This is done by specifying an `RLController` as the acceleration controller of the vehicle. 

In [5]:
from flow.controllers import RLController

In [6]:
# A single vehicle of type "rl": its acceleration is supplied through
# RLController, while routing uses the same ContinuousRouter as the humans.
vehicles.add(
    veh_id="rl",
    acceleration_controller=(RLController, {}),
    routing_controller=(ContinuousRouter, {}),
    num_vehicles=1,
)

## Setting up an Environment

In [7]:
from flow.core.params import SumoParams

# simulation settings: step of 0.1, rendering switched off
sim_params = SumoParams(
    sim_step=0.1,
    render=False,
)

### EnvParams

`EnvParams` specifies environment and experiment-specific parameters that either affect the training process or the dynamics of various components within the network. For the environment `WaveAttenuationPOEnv`, these parameters are used to dictate bounds on the accelerations of the autonomous vehicles, as well as the range of ring lengths (and accordingly network densities) the agent is trained on.

Finally, it is important to specify here the *horizon* of the experiment, which is the duration of one episode (during which the RL agent acquires data). 

In [8]:
from flow.core.params import EnvParams

# Episode length, kept in one variable so that every cell of the notebook
# uses the same value.
HORIZON = 100  # change this in order to change the dataset size

env_params = EnvParams(
    horizon=HORIZON,  # length of one rollout
    additional_params={
        "max_accel": 1,  # maximum acceleration of autonomous vehicles
        "max_decel": 1,  # maximum deceleration of autonomous vehicles
        # bounds on the ranges of ring road lengths the autonomous vehicle
        # is trained on
        "ring_length": [220, 270],
    },
)

### Initializing a Gym Environment

In [9]:
# Print the public names exported by the `flow.envs` package.
from flow import envs as flowenvs

print(flowenvs.__all__)

['Env', 'AccelEnv', 'LaneChangeAccelEnv', 'LaneChangeAccelPOEnv', 'TrafficLightGridTestEnv', 'MergePOEnv', 'BottleneckEnv', 'BottleneckAccelEnv', 'WaveAttenuationEnv', 'WaveAttenuationPOEnv', 'TrafficLightGridEnv', 'TrafficLightGridPOEnv', 'TrafficLightGridBenchmarkEnv', 'BottleneckDesiredVelocityEnv', 'TestEnv', 'BayBridgeEnv', 'BottleNeckAccelEnv', 'DesiredVelocityEnv', 'PO_TrafficLightGridEnv', 'GreenWaveTestEnv']


We will use the wave-attenuation environments, which are used to train autonomous vehicles to attenuate the formation and propagation of waves in a variable density ring road: "WaveAttenuationEnv" when training with DDPG, and its partially observable counterpart "WaveAttenuationPOEnv" when training with PPO (the next cell selects which one is active). To create the Gym Environment, the only necessary parameters are the environment name plus the previously defined variables. These are defined as follows:

In [10]:
# Import both candidate environments so that switching between them only
# requires moving the comment marker below; previously the PO variant was
# never imported and un-commenting its line raised a NameError.
from flow.envs import WaveAttenuationEnv, WaveAttenuationPOEnv

# Environment class handed to `flow_params`. Keep exactly one of the two
# assignments active, matching the algorithm cell you intend to run.
env_name = WaveAttenuationEnv          # when use DDPG Algo

# env_name = WaveAttenuationPOEnv      # when use PPO Algo

In [11]:
# Assemble flow_params. The dictionary keys must be exactly the ones below.
flow_params = {
    # name of the experiment
    "exp_tag": name,
    # flow environment class the experiment is running on
    "env_name": env_name,
    # network class the experiment uses
    "network": network_name,
    # simulator that is used by the experiment
    "simulator": 'traci',
    # simulation-related parameters
    "sim": sim_params,
    # environment related parameters (see flow.core.params.EnvParams)
    "env": env_params,
    # network-related parameters (see flow.core.params.NetParams and
    # the network's documentation or ADDITIONAL_NET_PARAMS component)
    "net": net_params,
    # vehicles to be placed in the network at the start of a rollout
    # (see flow.core.params.VehicleParams)
    "veh": vehicles,
    # (optional) parameters affecting the positioning of vehicles upon
    # initialization/reset (see flow.core.params.InitialConfig)
    "initial": initial_config,
}

## Running RL experiments

In [12]:
import json

import ray
# `get_agent_class` is taken from `ray.rllib.agents.agent` when that import
# succeeds; otherwise fall back to `ray.rllib.agents.registry`.
# NOTE(review): presumably this covers different Ray releases — confirm
# against the Ray version pinned for this project.
try:
    from ray.rllib.agents.agent import get_agent_class
except ImportError:
    from ray.rllib.agents.registry import get_agent_class
from ray.tune import run_experiments
from ray.tune.registry import register_env

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder

### Initializing Ray
Here, we initialize Ray and experiment-based constant variables specifying parallelism in the experiment as well as experiment batch size in terms of number of rollouts.

In [13]:
# number of parallel workers
N_CPUS = 2  # check the cpu core number of your PC before running it
# number of rollouts per training iteration; the PPO cell multiplies it by
# HORIZON to size the train batch, so raise it to collect more data per
# iteration (it does not change the number of iterations)
N_ROLLOUTS = 1

# `ignore_reinit_error=True` keeps this cell re-runnable: without it, calling
# ray.init() a second time in a kernel where Ray is already running raises
# instead of reusing the existing session.
ray.init(num_cpus=N_CPUS, ignore_reinit_error=True)

2022-05-26 00:58:39,492	INFO resource_spec.py:216 -- Starting Ray with 6.2 GiB memory available for workers and up to 3.1 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '172.25.7.78',
 'redis_address': '172.25.7.78:51518',
 'object_store_address': '/tmp/ray/session_2022-05-26_00-58-39_471097_408/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-05-26_00-58-39_471097_408/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-05-26_00-58-39_471097_408'}

### Configuration and Setup
Here, we copy and modify the default configuration for the [PPO algorithm](https://arxiv.org/abs/1707.06347). The agent has the number of parallel workers specified, a batch size corresponding to `N_ROLLOUTS` rollouts (each of which has length `HORIZON` steps), a discount rate $\gamma$ of 0.999, two hidden layers of size 16, uses Generalized Advantage Estimation, $\lambda$ of 0.97, and other parameters as set below.

Once `config` contains the desired parameters, a JSON string corresponding to the `flow_params` specified in section 3 is generated. The `FlowParamsEncoder` maps objects to string representations so that the experiment can be reproduced later. That string representation is stored within the `env_config` section of the `config` dictionary. Later, `config` is written out to the file `params.json`. 

Next, we call `make_create_env` and pass in the `flow_params` to return a function we can use to register our Flow environment with Gym. 

# RESTART THE KERNEL and run each of the following algorithms separately. DO NOT RUN THEM TOGETHER IN ONE SESSION.

# DDPG Algo

In [30]:
# Please adjust the parameters in this block to make the model converge
# to change the parameters, please check https://docs.ray.io/en/releases-0.8.1/rllib-algorithms.html
from copy import deepcopy

alg_run = "DDPG"

agent_cls = get_agent_class(alg_run)
# Deep-copy the defaults. A shallow `.copy()` shares the nested dictionaries
# (such as "env_config") with the agent class, so the item assignments below
# would silently modify the class-level default configuration as well.
config = deepcopy(agent_cls._default_config)
config["num_workers"] = N_CPUS - 1  # number of parallel workers
config["gamma"] = 0.999  # discount rate


# save the flow params for replay
flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True,
                       indent=4)  # generating a string version of flow_params
config['env_config']['flow_params'] = flow_json  # adding the flow_params to config dict
config['env_config']['run'] = alg_run

# Call the utility function make_create_env to be able to
# register the Flow env for this experiment
create_env, gym_name = make_create_env(params=flow_params, version=0)

# Register as rllib env with Gym
register_env(gym_name, create_env)

# PPO Algo

In [43]:
# to change the parameters, please check https://docs.ray.io/en/releases-0.8.1/rllib-algorithms.html
from copy import deepcopy

alg_run = "PPO"

agent_cls = get_agent_class(alg_run)
# Deep-copy the defaults. A shallow `.copy()` shares the nested dictionaries
# ("model", "env_config", ...) with the agent class, so `config["model"].update`
# and the `env_config` assignments below would silently modify the
# class-level default configuration as well.
config = deepcopy(agent_cls._default_config)
config["num_workers"] = N_CPUS - 1  # number of parallel workers
config["train_batch_size"] = HORIZON * N_ROLLOUTS  # batch size
config["gamma"] = 0.999  # discount rate
config["model"].update({"fcnet_hiddens": [16, 16]})  # size of hidden layers in network
config["use_gae"] = True  # using generalized advantage estimation
config["lambda"] = 0.97
# SGD minibatch size, capped so it never exceeds the train batch itself
config["sgd_minibatch_size"] = min(16 * 1024, config["train_batch_size"])
config["kl_target"] = 0.02  # target KL divergence
config["num_sgd_iter"] = 10  # number of SGD iterations
config["horizon"] = HORIZON  # rollout horizon

# save the flow params for replay
flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True,
                       indent=4)  # generating a string version of flow_params
config['env_config']['flow_params'] = flow_json  # adding the flow_params to config dict
config['env_config']['run'] = alg_run

# Call the utility function make_create_env to be able to
# register the Flow env for this experiment
create_env, gym_name = make_create_env(params=flow_params, version=0)

# Register as rllib env with Gym
register_env(gym_name, create_env)



### Training

In [31]:
# Launch a single experiment, keyed by the experiment tag, using whichever
# algorithm cell (`alg_run`, `config`, `gym_name`) was executed last.
trials = run_experiments({
    flow_params["exp_tag"]: {
        "run": alg_run,
        "env": gym_name,
        # shallow copy of the algorithm configuration built above
        "config": dict(config),
        # write a checkpoint after every iteration ...
        "checkpoint_freq": 1,
        # ... and one more when training finishes
        "checkpoint_at_end": True,
        "max_failures": 999,
        # stopping conditions: halt after this many training iterations
        "stop": {
            "training_iteration": 1,
        },
    },
})

2022-05-25 22:42:19,365	INFO ray_trial_executor.py:121 -- Trial DDPG_WaveAttenuationEnv-v0_d804742a: Setting up new remote runner.


Trial name,status,loc
DDPG_WaveAttenuationEnv-v0_d804742a,RUNNING,


[2m[36m(pid=4184)[0m 2022-05-25 22:42:21,495	INFO trainer.py:371 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=4184)[0m 2022-05-25 22:42:22,460	INFO trainer.py:512 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


### Visualizing the results

The simulation results are saved within the `ray_results/training_example` directory (we defined `training_example` at the start of this tutorial). The `ray_results` folder is by default located in your home directory, at `~/ray_results`. 

You can run `tensorboard --logdir=~/ray_results/training_example` (install it with `pip install tensorboard`) to visualize the different data outputted by your simulation.

For more instructions about visualizing, please see `tutorial05_visualize.ipynb`. 

### Restart from a checkpoint / Transfer learning

If you wish to do transfer learning, or to resume a previous training, you will need to start the simulation from a previous checkpoint. To do that, you can add a `restore` parameter in the `run_experiments` argument, as follows:

```python
trials = run_experiments({
    flow_params["exp_tag"]: {
        "run": alg_run,
        "env": gym_name,
        "config": {
            **config
        },
        "restore": "/ray_results/experiment/dir/checkpoint_50/checkpoint-50",
        "checkpoint_freq": 1,
        "checkpoint_at_end": True,
        "max_failures": 999,
        "stop": {
            "training_iteration": 1,
        },
    },
})
```

The `"restore"` path should be such that the `[restore]/.tune_metadata` file exists.

There is also a `"resume"` parameter that you can set to `True` if you just wish to continue the training from a previously saved checkpoint, in case you are still training on the same experiment. 