# Test of wrapped environments

Generic imports:

In [1]:
import numpy as np
import gym

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from src import wenvs

Basic configuration:

In [4]:
# Gym environment ids used throughout the notebook.
# ENV_CONTINUOUS is wrapped below with continuous_state=True (PPO2 / A2C cells).
ENV_CONTINUOUS = 'CartPole-v1'
# ENV_DISCRETE is used below for the tabular Q-learning and discrete PPO2 cells.
ENV_DISCRETE = 'FrozenLake-v0'

## Normal env

In [5]:
from stable_baselines.common import set_global_seeds
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import A2C, PPO2

Learn on usual environment

In [6]:
# Base Gym environment, wrapped by the project wrapper with a continuous state.
env = gym.make(ENV_CONTINUOUS)
wenv = wenvs.WrapperEnv(env, continuous_state=True)

# Seed NumPy's global RNG and the wrapped environment.
np.random.seed(0)
wenv.seed(0)

# The algorithms require a vectorized environment to run, so the single
# wrapped env is exposed through a one-element DummyVecEnv.
vec_env = DummyVecEnv([lambda: wenv])

In [7]:
# Train PPO2 with an MLP policy on the vectorized environment.
model = PPO2(MlpPolicy, vec_env)
# `learn` returns the trained model itself, not a training history, so its
# result is no longer bound to the misleading name `hist`.
# The trailing `;` keeps the model repr out of the cell output.
model.learn(total_timesteps=50000);

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Run episode of normal environment

In [8]:
def policy(obs):
    """Return the first element of `model.predict(obs)` (used as the action)."""
    return model.predict(obs)[0]


# Always close the environment, even if the episode raises, so that the
# render window is not left open.
try:
    hist = wenv.run_episode(policy=policy, render=True)
finally:
    wenv.close()

Finally, evaluate our algorithm and plot.

In [9]:
# Sum of hist[2] over the episode -- presumably the per-step rewards, i.e. the
# episode return (TODO confirm against WrapperEnv.run_episode).
hist[2].sum()

200.0

## Wrapped env test

Magic to develop custom wrapper: 

### Simple Q-Learning

Simple Q-learning on the discrete FrozenLake problem

In [10]:
# Arbitrary extra features computed from the observation, each paired (by
# position) with the discrete size declared for it in `fun_space`.
# NOTE(review): 16 is presumably the number of FrozenLake-v0 states -- confirm.

fun_list = [
    lambda obs: obs ** 2,
    lambda obs: obs + 2,
]
fun_space = [
    16 ** 2 + 1,
    16 + 2,
]

In [11]:
# Discrete base environment, wrapped with one fake feature, two fake actions
# and the extra feature functions together with their declared discrete sizes.
env = gym.make(ENV_DISCRETE)
wenv = wenvs.WrapperEnv(
    env,
    n_fake_features=1,
    n_fake_actions=2,
    fun_list=fun_list,
    fun_discrete_sizes=fun_space,
)

# Seed NumPy's global RNG and the wrapped environment.
np.random.seed(0)
wenv.seed(0);  # trailing `;` keeps the return value out of the cell output

Run episode

In [12]:
# Run one episode without passing a policy (the wrapper's default behaviour).
hist = wenv.run_episode()
# Sum of hist[2] -- presumably the per-step rewards (TODO confirm).
hist[2].sum()

0.0

Tabular learning for discrete spaces

In [13]:
# Sizes of the wrapped observation and action spaces, as reported by the
# project helper (one entry per dimension, judging by the len()/prod() below).
state_sizes = wenvs.discrete_space_size(wenv.observation_space)
act_sizes = wenvs.discrete_space_size(wenv.action_space)

# Number of dimensions in each space.
n_obs, n_act = len(state_sizes), len(act_sizes)

# Total number of joint states / joint actions once the spaces are flattened.
n_state, n_actions = np.prod(state_sizes), np.prod(act_sizes)

# (states, actions) pair handed to the tabular learner -- presumably the
# shape of its Q-table (TODO confirm against wenvs.Q_learing).
cs = (n_state, n_actions)

In [14]:
# Tabular Q-learning on the wrapped environment for 100000 iterations.
# (Original author's note: eventually `%reset array`.)
q, pi = wenvs.Q_learing(wenv, cs, iterMax=100000)


def policy(obs):
    """Encode `obs`, look up `pi` at that index and decode the result."""
    encoded_obs = wenv.encode_obs(obs)
    return wenv.decode_act(pi[encoded_obs])

Evaluation

In [15]:
# Run one episode with the learned tabular policy and sum element [2] of the
# returned history -- presumably the per-step rewards (TODO confirm).
wenv.run_episode(policy=policy)[2].sum()

0.0

Alternative function test

In [16]:
# Alternative learner that receives the per-dimension sizes directly instead
# of a flattened (states, actions) pair.
q, pi = wenvs.Q_learing_multidim(wenv, state_sizes, act_sizes, iterMax=100000)


def policy_dim(obs):
    """Index `pi` with the components of `obs` as a tuple."""
    return pi[tuple(obs)]

In [17]:
# Run one episode with the multi-dimensional policy and sum element [2] of the
# returned history -- presumably the per-step rewards (TODO confirm).
wenv.run_episode(policy=policy_dim)[2].sum()

0.0

Proper policy evaluation

### Baselines

Training with a state-of-the-art algorithm for continuous spaces

In [18]:
#plot
#from stable_baselines.bench import Monitor # env = Monitor(env, log_dir)
#from stable_baselines.results_plotter import load_results, ts2xy, plot_results
#x, y = ts2xy(load_results(log_folder), 'timesteps') #time, reward

In [19]:
# Discrete base environment wrapped with two fake features and two fake actions.
env = gym.make(ENV_DISCRETE)
wenv = wenvs.WrapperEnv(
    env,
    n_fake_features=2,
    n_fake_actions=2,
)

# Seed NumPy's global RNG and the wrapped environment.
np.random.seed(0)
wenv.seed(0)

# Expose the single wrapped env through a one-element vectorized environment.
vec_env = DummyVecEnv([lambda: wenv])

In [20]:
# Baseline before training: one episode without passing a policy.
hist = wenv.run_episode()
# Sum of hist[2] -- presumably the per-step rewards (TODO confirm).
hist[2].sum()

0.0

In [21]:
# Train PPO2 with an MLP policy on the wrapped discrete environment.
model = PPO2(MlpPolicy, vec_env)
# `learn` returns the trained model itself, not a training history, so its
# result is no longer bound to the misleading name `hist`.
# The trailing `;` keeps the model repr out of the cell output.
model.learn(total_timesteps=10000);

Evaluation and plot

In [22]:
def policy(obs):
    """Return the first element of `model.predict(obs)` (used as the action)."""
    prediction = model.predict(obs)
    return prediction[0]


# Roll out one episode of the wrapped environment with the trained model.
hist = wenv.run_episode(policy=policy)

In [23]:
# Sum of hist[2] over the episode -- presumably the per-step rewards, i.e. the
# episode return (TODO confirm against WrapperEnv.run_episode).
hist[2].sum()

0.0

#### Multiprocessing (SubprocVecEnv)

In [24]:
def make_wenv(env_id, rank, seed=0, continuous_state=False, continuous_actions=False,
              n_fake_features=2, n_fake_actions=2):
    """Build a factory that creates one seeded, wrapped environment.

    The global seeds are set immediately, when this function is called; the
    environment itself is only created when the returned factory is called.

    :param env_id: (str) environment id passed to ``gym.make``
    :param rank: (int) index of the environment; added to ``seed`` to obtain
        the seed of the wrapped environment
    :param seed: (int) base seed, also passed to ``set_global_seeds``
    :param continuous_state: (bool) forwarded to ``wenvs.WrapperEnv``
    :param continuous_actions: (bool) forwarded to ``wenvs.WrapperEnv``
    :param n_fake_features: (int) forwarded to ``wenvs.WrapperEnv``; defaults
        to 2, the value that used to be hard-coded
    :param n_fake_actions: (int) forwarded to ``wenvs.WrapperEnv``; defaults
        to 2, the value that used to be hard-coded
    :return: (callable) zero-argument function returning the wrapped env
    """
    def _init():
        env = gym.make(env_id)
        wenv = wenvs.WrapperEnv(
            env,
            n_fake_features=n_fake_features,
            n_fake_actions=n_fake_actions,
            continuous_state=continuous_state,
            continuous_actions=continuous_actions,
        )
        # Offset the seed by the rank so each environment gets its own seed.
        wenv.seed(seed + rank)
        return wenv

    set_global_seeds(seed)
    return _init

In [25]:
# Build the factory for a single continuous-state wrapped env (rank 0, base
# seed 1000) and call it immediately to obtain a local environment.
wenv = make_wenv(
    ENV_CONTINUOUS,
    rank=0,
    seed=1000,
    continuous_state=True,
)()

In [26]:
# Four environment factories (ranks 0..3, default base seed) handed to
# SubprocVecEnv.
vec_env = SubprocVecEnv(
    [
        make_wenv(ENV_CONTINUOUS, rank=i, continuous_state=True)
        for i in range(4)
    ]
)

In [27]:
# Train A2C with an MLP policy on the multi-environment vectorized env.
model = A2C(MlpPolicy, vec_env)
# `learn` returns the trained model itself, not a training history, so its
# result is no longer bound to the misleading name `hist`.
# The trailing `;` keeps the model repr out of the cell output.
model.learn(total_timesteps=50000);

In [28]:
def policy(obs):
    """Return the first element of `model.predict(obs)` (used as the action)."""
    prediction = model.predict(obs)
    return prediction[0]
# Kept as a named function rather than a lambda. Original author's note:
# vec_env.env_method doesn't work, but it would need a function, not a lambda.

In [29]:
# Run one rendered episode with the trained model. The environment is closed
# even if the episode raises, so the render window is not left open.
try:
    hist = wenv.run_episode(policy=policy, render=True)
finally:
    wenv.close()

In [30]:
# Sum of hist[2] over the episode -- presumably the per-step rewards, i.e. the
# episode return (TODO confirm against WrapperEnv.run_episode).
hist[2].sum()

101.0