# CDS-RL Demos
A place to experiment with RL techniques on various systems that are well understood by CDS.

Note: set up a conda environment according to `setup_instructions.txt`.

In [None]:
import matplotlib
%matplotlib notebook

# Output locations: root directory for results, plus its figures subfolder
output_dir = '/Users/nataliebernat/Documents/Github/Research/Doyle_RLandCtrl/RL_outputs'
fig_dir = '{0}/figures/'.format(output_dir)


## Sanity check: Spinningup

Setup

In [None]:
from spinup import ppo, vpg
from spinup.utils import plot
import tensorflow as tf
import gym
import matplotlib.pyplot as plt

import matplotlib
%matplotlib notebook

# Output locations: root directory for results, plus its figures subfolder
output_dir = '/Users/nataliebernat/Documents/Github/Research/Doyle_RLandCtrl/RL_outputs'
fig_dir = '{0}/figures/'.format(output_dir)

In [None]:
# Shared settings for the Spinning Up sanity-check runs below
num_epochs = 100  # (this will take like 5-10min per experiment)
num_steps_per_epoch = 5000
# Actor-critic network options: two hidden layers of 64 units with ReLU
ac_kwargs = {'hidden_sizes': [64, 64], 'activation': tf.nn.relu}
# Factory that builds a new LunarLander environment each time it is called
env_fn = lambda: gym.make('LunarLander-v2')

Run each exp (saves outputs automatically)

In [None]:
exp_name = 'vpg_test'

# Start from a clean TF graph, as the other training cells in this notebook do,
# so this cell can be re-run (or run after another experiment) in the same kernel.
tf.reset_default_graph()
logger_kwargs = dict(output_dir=output_dir+'/spinup_tests/'+exp_name+'/', exp_name=exp_name)
# NOTE(review): ac_kwargs is passed to the PPO run but not here, so VPG uses
# spinup's default network settings -- confirm this difference is intended.
vpg(env_fn=env_fn, steps_per_epoch=num_steps_per_epoch, epochs=num_epochs, logger_kwargs=logger_kwargs)

In [None]:
exp_name = 'ppo_test'

# Drop any graph left over from an earlier run before building PPO
tf.reset_default_graph()
logger_kwargs = {
    'output_dir': '{0}/spinup_tests/{1}/'.format(output_dir, exp_name),
    'exp_name': exp_name,
}
ppo(
    env_fn=env_fn,
    ac_kwargs=ac_kwargs,
    steps_per_epoch=num_steps_per_epoch,
    epochs=num_epochs,
    logger_kwargs=logger_kwargs,
)

Plot the learning curves. (This sometimes crashes the kernel; the training data is already saved to disk, so just re-run this cell.)

In [None]:
# Signature reminder:
# plot_data(data, xaxis='Epoch', value="AverageEpRet", condition="Condition1", smooth=1, **kwargs)

# Load every logged run found under the spinup_tests output folder
data = plot.get_all_datasets(['{0}/spinup_tests'.format(output_dir)], None, None, None)
plt.figure()
plot.plot_data(data)

## Sanity Check: Stable-Baselines

Learn a policy for CartPole (training progress is printed as it runs), then render the trained policy as video.

In [None]:
import gym
from IPython import display
import matplotlib
import matplotlib.pyplot as plt

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

import matplotlib
%matplotlib notebook

# Output locations: root directory for results, plus its figures subfolder
output_dir = '/Users/nataliebernat/Documents/Github/Research/Doyle_RLandCtrl/RL_outputs'
fig_dir = '{0}/figures/'.format(output_dir)

In [None]:
# The algorithms require a vectorized environment to run, so CartPole is
# created inside the factory handed to DummyVecEnv.
env = DummyVecEnv([lambda: gym.make('CartPole-v1')])

# PPO2 agent with an MLP policy; verbose=1 turns on training output
model = PPO2(policy=MlpPolicy, env=env, verbose=1)

In [None]:
# Train the PPO2 model for 100 environment timesteps
model.learn(total_timesteps=100)

In [None]:
# NOTE: This is very slow.... Eventually make a wrapper for this, and just generate a video and then play the video

obs = env.reset()
# Create the image artist a single time; each step below only swaps its pixel data
img = plt.imshow(env.render(mode='rgb_array'))
for i in range(1000):
    frame = env.render(mode='rgb_array')
    img.set_data(frame)
    # Show the current figure, then queue the clear so the next frame replaces it
    display.display(plt.gcf())
    display.clear_output(wait=True)
    # Ask the trained model for an action and advance the environment one step
    prediction = model.predict(obs)
    action, _states = prediction
    obs, rewards, dones, info = env.step(action)
    

## CDS Alpha experiments

### Large batch of seeded expts w/ white noise

In [8]:
import numpy as np
from spinup import ppo, vpg
from spinup.utils import plot
import tensorflow as tf
import gym
import gym_cdsalpha
import matplotlib.pyplot as plt
import os

import matplotlib
%matplotlib notebook

def make_dir_strs(test_group, reg, horiz, seed, sigma,
                  out_dir='/Users/nataliebernat/Documents/Github/Research/Doyle_RLandCtrl/RL_outputs'):
    """Build the output and figure directory paths for one experiment setup.

    The paths have the form
    ``<root>/<test_group>/horiz<H>reg<R>/<noise>/<seed>/`` where ``<root>`` is
    ``out_dir`` for outputs and ``out_dir + '/figures'`` for figures.

    Args:
        test_group: Name of the experiment group (first path component).
        reg: Regularization value, formatted as ``reg{:.0e}``.
        horiz: Horizon value, formatted as ``horiz{}``.
        seed: Seed value, or None for an unseeded run ('unseeded').
        sigma: Noise level; 0 yields 'nonoise', anything else
            ``whitenoise{:.1f}`` with '.' written as 'pt'.
        out_dir: Root output directory. Defaults to the original hard-coded
            location, so existing calls are unaffected.

    Returns:
        Tuple ``(exp_out_dir, exp_fig_dir)``; both end with '/'.
    """
    reg_str = 'reg{0:.0e}'.format(reg).replace('.', 'pt')
    horiz_str = 'horiz{0}'.format(horiz)
    # Identity check: None is a singleton, and '==' can misbehave on array-like seeds
    seed_str = 'unseeded' if seed is None else 'seed{0}'.format(seed)

    if sigma == 0:
        noise_str = 'nonoise'
    else:
        noise_str = 'whitenoise{0:.1f}'.format(sigma).replace('.', 'pt')

    # The '.' -> 'pt' substitution is applied to the experiment-specific part
    # only, so a caller-supplied out_dir containing dots is left untouched.
    rel_dir = '{0}/{1}/{2}/{3}/'.format(
        test_group, horiz_str+reg_str, noise_str, seed_str).replace('.', 'pt')

    # Setup outputs
    fig_dir = out_dir+'/figures'
    exp_out_dir = '{0}/{1}'.format(out_dir, rel_dir)
    exp_fig_dir = '{0}/{1}'.format(fig_dir, rel_dir)

    return exp_out_dir, exp_fig_dir

In [None]:
# # # Train # # #
# Sweeps over (reg, horiz, alpha) and trains VPG and/or PPO on 'cdsalpha-v1',
# logging each run under the directory returned by make_dir_strs.

# Setup experiment parameters
test_group = 'normrew_tests'
do_vpg = True
do_ppo = False
seed = 1
sigma = 0.4
regs = [0,1]        #[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
horizs = [50, 500]
alphas = [0.0, 0.1, 0.2, 0.4, 0.8, 1.0]

# Setup learning parameters
num_steps_per_epoch=5000
num_epochs = 100 #(this will take like 5-10min per experiment, longer when it performs poorly)

for reg in regs:
    for horiz in horizs:
        # The directories depend on (reg, horiz) but not on alpha, so resolve
        # them once per pair rather than once per alpha.
        exp_out_dir, exp_fig_dir = make_dir_strs(test_group, reg, horiz, seed, sigma)

        # Create the figure directory here: exp_fig_dir is only known once
        # make_dir_strs has run, so it cannot be checked before the loop.
        if not os.path.exists(exp_fig_dir):
            os.makedirs(exp_fig_dir)

        for alpha in alphas:

            # The lambda reads alpha/reg/horiz when called; the training
            # functions are invoked within this same iteration.
            env_fn = lambda : gym.make('cdsalpha-v1', a=alpha, sigma=sigma, seed=seed, reg=reg, horiz=horiz)

            if do_vpg:
                exp_name = 'alpha{0:.1f}_vpg_test'.format(alpha).replace('.','pt')
                tf.reset_default_graph()
                logger_kwargs = dict(output_dir=exp_out_dir+exp_name+'/', exp_name=exp_name)
                vpg(env_fn=env_fn, steps_per_epoch=num_steps_per_epoch, epochs=num_epochs, logger_kwargs=logger_kwargs)

            if do_ppo:
                exp_name = 'alpha{0:.1f}_ppo_test'.format(alpha).replace('.','pt')
                tf.reset_default_graph()
                logger_kwargs = dict(output_dir=exp_out_dir+exp_name+'/', exp_name=exp_name)
                ppo(env_fn=env_fn, steps_per_epoch=num_steps_per_epoch, epochs=num_epochs, logger_kwargs=logger_kwargs)

[32;1mLogging data to /Users/nataliebernat/Documents/Github/Research/Doyle_RLandCtrl/RL_outputs/normrew_tests/horiz5reg1e+00/whitenoise0pt4/seed1/alpha0pt0_vpg_test/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "action_space":	{
            "Box(1,)":	{
                "bounded_above":	"[ True]",
                "bounded_below":	"[ True]",
                "dtype":	"float32",
                "high":	"[100.]",
                "low":	"[-100.]",
                "np_random":	"<mtrand.RandomState object at 0x1c36a70438>",
                "shape":	[
                    1
                ]
            }
        }
    },
    "actor_critic":	"mlp_actor_critic",
    "env_fn":	"<function <lambda> at 0x1c411dad90>",
    "epochs":	100,
    "exp_name":	"alpha0pt0_vpg_test",
    "gamma":	0.99,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x102824e10>":	{
            "epoch_dict":	{},
            "exp_name":	"alpha0pt0_vpg_test",
       

---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -2.2 |
|          StdEpRet |            1.41 |
|          MaxEpRet |          -0.248 |
|          MinEpRet |             -13 |
|             EpLen |               5 |
|      AverageVVals |            -1.7 |
|          StdVVals |          0.0345 |
|          MaxVVals |           -1.51 |
|          MinVVals |           -1.84 |
| TotalEnvInteracts |         4.5e+04 |
|            LossPi |          -0.256 |
|             LossV |            1.71 |
|       DeltaLossPi |        -0.00724 |
|        DeltaLossV |         -0.0063 |
|           Entropy |           0.937 |
|                KL |         0.00152 |
|              Time |            49.7 |
---------------------------------------
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |           -2.13 |
|          StdEpRet |             1.3 |
|          MaxEpRet |          -0.179 |


---------------------------------------
|             Epoch |              18 |
|      AverageEpRet |           -1.98 |
|          StdEpRet |            1.12 |
|          MaxEpRet |          -0.188 |
|          MinEpRet |           -7.12 |
|             EpLen |               5 |
|      AverageVVals |           -1.53 |
|          StdVVals |           0.257 |
|          MaxVVals |           -1.32 |
|          MinVVals |            -3.4 |
| TotalEnvInteracts |         9.5e+04 |
|            LossPi |          -0.266 |
|             LossV |           0.986 |
|       DeltaLossPi |        0.000473 |
|        DeltaLossV |         -0.0112 |
|           Entropy |           0.915 |
|                KL |       -0.000211 |
|              Time |              98 |
---------------------------------------
---------------------------------------
|             Epoch |              19 |
|      AverageEpRet |           -2.05 |
|          StdEpRet |            1.15 |
|          MaxEpRet |          -0.146 |


---------------------------------------
|             Epoch |              28 |
|      AverageEpRet |           -2.01 |
|          StdEpRet |             1.1 |
|          MaxEpRet |          -0.213 |
|          MinEpRet |           -7.77 |
|             EpLen |               5 |
|      AverageVVals |           -1.56 |
|          StdVVals |           0.331 |
|          MaxVVals |           -1.11 |
|          MinVVals |           -3.95 |
| TotalEnvInteracts |        1.45e+05 |
|            LossPi |          -0.281 |
|             LossV |           0.921 |
|       DeltaLossPi |       -0.000246 |
|        DeltaLossV |         -0.0168 |
|           Entropy |           0.916 |
|                KL |       -0.000113 |
|              Time |             149 |
---------------------------------------
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |           -1.99 |
|          StdEpRet |            1.08 |
|          MaxEpRet |          -0.161 |


---------------------------------------
|             Epoch |              38 |
|      AverageEpRet |           -1.99 |
|          StdEpRet |            1.08 |
|          MaxEpRet |           -0.22 |
|          MinEpRet |           -6.39 |
|             EpLen |               5 |
|      AverageVVals |           -1.57 |
|          StdVVals |            0.43 |
|          MaxVVals |          -0.998 |
|          MinVVals |           -4.55 |
| TotalEnvInteracts |        1.95e+05 |
|            LossPi |          -0.282 |
|             LossV |           0.836 |
|       DeltaLossPi |       -0.000484 |
|        DeltaLossV |         -0.0105 |
|           Entropy |           0.909 |
|                KL |        1.94e-05 |
|              Time |             198 |
---------------------------------------
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |           -2.01 |
|          StdEpRet |            1.05 |
|          MaxEpRet |          -0.139 |


---------------------------------------
|             Epoch |              48 |
|      AverageEpRet |           -1.96 |
|          StdEpRet |             1.1 |
|          MaxEpRet |           -0.13 |
|          MinEpRet |           -6.93 |
|             EpLen |               5 |
|      AverageVVals |           -1.51 |
|          StdVVals |            0.43 |
|          MaxVVals |              -1 |
|          MinVVals |           -4.83 |
| TotalEnvInteracts |        2.45e+05 |
|            LossPi |          -0.272 |
|             LossV |           0.844 |
|       DeltaLossPi |       -0.000231 |
|        DeltaLossV |        -0.00408 |
|           Entropy |           0.897 |
|                KL |       -3.34e-05 |
|              Time |             250 |
---------------------------------------
---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |           -1.96 |
|          StdEpRet |            1.05 |
|          MaxEpRet |          -0.186 |


---------------------------------------
|             Epoch |              58 |
|      AverageEpRet |           -1.98 |
|          StdEpRet |            1.09 |
|          MaxEpRet |          -0.169 |
|          MinEpRet |           -7.05 |
|             EpLen |               5 |
|      AverageVVals |           -1.54 |
|          StdVVals |           0.411 |
|          MaxVVals |           -1.07 |
|          MinVVals |           -4.35 |
| TotalEnvInteracts |        2.95e+05 |
|            LossPi |          -0.272 |
|             LossV |           0.842 |
|       DeltaLossPi |       -0.000189 |
|        DeltaLossV |        -0.00733 |
|           Entropy |           0.909 |
|                KL |       -1.03e-05 |
|              Time |             299 |
---------------------------------------
---------------------------------------
|             Epoch |              59 |
|      AverageEpRet |           -1.92 |
|          StdEpRet |            1.04 |
|          MaxEpRet |          -0.108 |


---------------------------------------
|             Epoch |              68 |
|      AverageEpRet |           -1.94 |
|          StdEpRet |            1.05 |
|          MaxEpRet |          -0.228 |
|          MinEpRet |            -8.2 |
|             EpLen |               5 |
|      AverageVVals |           -1.49 |
|          StdVVals |            0.43 |
|          MaxVVals |          -0.996 |
|          MinVVals |           -4.59 |
| TotalEnvInteracts |        3.45e+05 |
|            LossPi |          -0.284 |
|             LossV |             0.8 |
|       DeltaLossPi |       -0.000424 |
|        DeltaLossV |        -0.00972 |
|           Entropy |           0.891 |
|                KL |        6.92e-05 |
|              Time |             347 |
---------------------------------------
---------------------------------------
|             Epoch |              69 |
|      AverageEpRet |           -1.92 |
|          StdEpRet |            1.07 |
|          MaxEpRet |          -0.182 |


---------------------------------------
|             Epoch |              78 |
|      AverageEpRet |           -1.94 |
|          StdEpRet |            1.09 |
|          MaxEpRet |          -0.265 |
|          MinEpRet |           -12.3 |
|             EpLen |               5 |
|      AverageVVals |            -1.5 |
|          StdVVals |           0.434 |
|          MaxVVals |          -0.921 |
|          MinVVals |           -6.32 |
| TotalEnvInteracts |        3.95e+05 |
|            LossPi |          -0.288 |
|             LossV |           0.828 |
|       DeltaLossPi |       -0.000173 |
|        DeltaLossV |         -0.0137 |
|           Entropy |           0.894 |
|                KL |       -4.18e-05 |
|              Time |             397 |
---------------------------------------
---------------------------------------
|             Epoch |              79 |
|      AverageEpRet |           -1.98 |
|          StdEpRet |            1.12 |
|          MaxEpRet |          -0.202 |


---------------------------------------
|             Epoch |              88 |
|      AverageEpRet |            -1.9 |
|          StdEpRet |            1.04 |
|          MaxEpRet |          -0.127 |
|          MinEpRet |           -6.23 |
|             EpLen |               5 |
|      AverageVVals |           -1.49 |
|          StdVVals |           0.421 |
|          MaxVVals |          -0.989 |
|          MinVVals |           -5.19 |
| TotalEnvInteracts |        4.45e+05 |
|            LossPi |          -0.279 |
|             LossV |           0.779 |
|       DeltaLossPi |       -0.000165 |
|        DeltaLossV |        -0.00949 |
|           Entropy |           0.883 |
|                KL |       -2.78e-05 |
|              Time |             479 |
---------------------------------------
---------------------------------------
|             Epoch |              89 |
|      AverageEpRet |           -1.92 |
|          StdEpRet |            1.06 |
|          MaxEpRet |          -0.159 |


In [None]:
# # # Plot Training # # #
# For each (reg, alpha), loads the logged runs whose folder name starts with
# the alpha prefix, plots them, and saves the figure to the figure directory.

#plot_data(data, xaxis='Epoch', value="AverageEpRet", condition="Condition1", smooth=1, **kwargs)
seed = 1
sigma = 0.1 #[0.1, 0.4]
regs = [1]#, 0.1, 0.01, 0.001, 0.0001, 0.00001]
alphas = [0.0, 0.1, 0.2, 0.4, 0.8, 1.0]
horiz = 5
test_group = 'normrew_tests'

# Title fragments. These names are locals of make_dir_strs and are not
# available at cell scope, so rebuild them here with the same formatting.
horiz_str = 'horiz{0}'.format(horiz)
if sigma == 0:
    noise_str = 'nonoise'
else:
    noise_str = 'whitenoise{0:.1f}'.format(sigma).replace('.','pt')

for reg in regs:
    reg_str = 'reg{0:.0e}'.format(reg).replace('.','pt')

    # Directories do not depend on alpha, so resolve them once per reg
    exp_out_dir, exp_fig_dir = make_dir_strs(test_group, reg, horiz, seed, sigma)

    # Make sure the target folder exists before any figure is saved into it
    if not os.path.exists(exp_fig_dir):
        os.makedirs(exp_fig_dir)

    for alpha in alphas:
        test_name = 'alpha{0:.1f}'.format(alpha).replace('.','pt')
        data = plot.get_all_datasets([exp_out_dir+test_name], None, None, None)
        plt.figure()
        plt.title('{0},{1},{2},{3}'.format(test_name,noise_str,horiz_str,reg_str))
        plot.plot_data(data)
        plt.ylim((-5, 0)) #bottom, top
        plt.savefig(exp_fig_dir+test_name)#+'_closeup')

In [None]:
# # # Test # # #
# Loads a policy trained under (train_reg, train_horiz) and runs it in an
# environment built with the test-time settings (test_reg, test_horiz).

from spinup.utils.test_policy import load_policy, run_policy

alpha = 0.0
sigma = 0.4
seed = 1
# Was a bare name with no value (NameError). 1 matches the logged training run
# directory 'horiz5reg1e+00' -- change it to load a policy trained with another reg.
train_reg = 1
test_reg = 0
train_horiz = 5
test_horiz = 100 # Longer for testing
test_group = 'normrew_tests'
alg_type = 'vpg'

# Locate the saved training run from the training-time settings
exp_out_dir, exp_fig_dir = make_dir_strs(test_group, train_reg, train_horiz, seed, sigma)
test_name = 'alpha{0:.1f}_{1}_test/'.format(alpha, alg_type).replace('.','pt')

_, get_action = load_policy(exp_out_dir+test_name)
env = gym.make('cdsalpha-v1', a=alpha, sigma=sigma, seed=seed, reg=test_reg, horiz=test_horiz)
run_policy(env, get_action, render=False)