# Test of wrapped environments

Generic imports:

In [1]:
import numpy as np
import gym

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import wenvs

Basic configuration:

In [4]:
# Gym environment ids shared by the cells below.
# ENV_CONTINUOUS is the id used for the runs wrapped with continuous_state=True.
ENV_CONTINUOUS = "CartPole-v1"
# ENV_DISCRETE is the id used for the tabular / fake-feature runs.
ENV_DISCRETE = "FrozenLake-v0"

## Normal env

In [16]:
from stable_baselines.common import set_global_seeds
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import A2C, PPO2

Learn on usual environment

In [11]:
# Build the wrapped environment from the shared ENV_CONTINUOUS constant rather
# than repeating the environment id as a literal.
env = gym.make(ENV_CONTINUOUS)
wenv = wenvs.WrapperEnv(env, continuous_state=True)

# Seed NumPy's global RNG and the wrapped environment.
np.random.seed(0)
wenv.seed(0)

# The algorithms require a vectorized environment to run; the factory simply
# returns the `wenv` built above.
vec_env = DummyVecEnv([lambda: wenv])

In [12]:
# Train a PPO2 agent with an MLP policy on the vectorized environment.
# stable-baselines' `learn` returns the model itself, not a training history,
# so its result is no longer bound to `hist` (that name is assigned by the
# episode run that follows). The trailing `;` keeps the cell from echoing it.
model = PPO2(MlpPolicy, vec_env)
model.learn(total_timesteps=10000);

Run episode of normal environment

In [13]:
def policy(obs):
    """Return element 0 of `model.predict(obs)`.

    NOTE(review): element 0 is presumably the action in the stable-baselines
    API -- confirm.
    """
    prediction = model.predict(obs)
    return prediction[0]


# Run one episode of the wrapped environment driven by the trained model.
hist = wenv.run_episode(policy=policy)

Finally, evaluate our algorithm and plot.

In [14]:
# Sum element 2 of the episode history.
# NOTE(review): presumably the per-step rewards -- confirm against wenvs.run_episode.
hist[2].sum()

148.0

## Wrapped env test

Magic to develop custom wrapper: 

### Simple Q-Learning

Simple Q-learning on the discrete FrozenLake problem

In [51]:
# Wrap the discrete environment, requesting two fake features, two fake
# actions and one combination from the wrapper.
env = gym.make(ENV_DISCRETE)
wenv = wenvs.WrapperEnv(
    env,
    n_fake_features=2,
    n_fake_actions=2,
    n_combinations=1,
)

# Seed NumPy's global RNG and the wrapped environment.
np.random.seed(0)
wenv.seed(0);  # trailing `;` keeps the cell from echoing seed()'s return value

Run episode

In [197]:
# Run one episode without passing a policy and sum element 2 of its history.
# NOTE(review): presumably run_episode falls back to a default action choice
# and hist[2] holds the per-step rewards -- confirm in wenvs.
hist = wenv.run_episode()
hist[2].sum()

0.0

Tabular learning for discrete spaces

In [108]:
# Sizes of the observation and action spaces as reported by wenvs
# (presumably one entry per dimension -- confirm in wenvs.discrete_space_size).
state_sizes = wenvs.discrete_space_size(wenv.observation_space)
act_sizes = wenvs.discrete_space_size(wenv.action_space)

# Number of entries in each size list.
n_obs, n_act = len(state_sizes), len(act_sizes)

# Products of the sizes: total counts of joint states and joint actions.
n_state = np.prod(state_sizes)
n_actions = np.prod(act_sizes)

# (states, actions) pair handed to the tabular learner.
cs = (n_state, n_actions)

In [226]:
# Tabular Q-learning over the (n_state, n_actions) pair `cs`.
# Original note kept as-is: eventually %reset array
q, pi = wenvs.Q_learing(wenv, cs, iterMax=int(1e5))


def policy(obs):
    """Encode `obs`, look the result up in `pi`, and decode that entry."""
    encoded_obs = wenv.encode_obs(obs)
    return wenv.decode_act(pi[encoded_obs])

Evaluation

In [239]:
# Run one episode with the Q-learning `policy` and sum element 2 of its history.
# NOTE(review): presumably the per-step rewards -- confirm against wenvs.run_episode.
wenv.run_episode(policy=policy)[2].sum()

0.0

Alternative function test

In [199]:
# Same experiment through the multidimensional variant, which takes the
# state and action size lists instead of the (n_state, n_actions) pair.
q, pi = wenvs.Q_learing_multidim(wenv, state_sizes, act_sizes, iterMax=int(1e5))


def policy_dim(obs):
    """Index `pi` with the components of `obs` as a tuple key."""
    return pi[tuple(obs)]

In [225]:
# Run one episode with `policy_dim` and sum element 2 of its history.
# NOTE(review): presumably the per-step rewards -- confirm against wenvs.run_episode.
wenv.run_episode(policy=policy_dim)[2].sum()

0.0

Proper policy evaluation

### Baselines

Training with a state-of-the-art algorithm for continuous spaces

In [107]:
# TODO: plot the training curve (sketch below; `log_folder` is not defined in this notebook)
#from stable_baselines.bench import Monitor
#from stable_baselines.results_plotter import load_results, ts2xy
#x, y = ts2xy(load_results(log_folder), 'timesteps') #time, reward

In [102]:
# Rebuild the wrapped discrete environment (two fake features, two fake
# actions, one combination) for the stable-baselines run.
env = gym.make(ENV_DISCRETE)
wenv = wenvs.WrapperEnv(
    env,
    n_fake_features=2,
    n_fake_actions=2,
    n_combinations=1,
)

# Seed NumPy's global RNG and the wrapped environment.
np.random.seed(0)
wenv.seed(0)

# Vectorized wrapper whose single factory returns the `wenv` built above.
vec_env = DummyVecEnv([lambda: wenv])

In [103]:
# Run one episode without passing a policy, before any training, and sum
# element 2 of its history.
# NOTE(review): presumably the per-step rewards -- confirm against wenvs.run_episode.
hist = wenv.run_episode()
hist[2].sum()

0.0

Plot of training

In [104]:
# Train a PPO2 agent with an MLP policy on the vectorized environment.
# stable-baselines' `learn` returns the model itself, not a training history,
# so its result is no longer bound to `hist` (that name is assigned by the
# episode run that follows). The trailing `;` keeps the cell from echoing it.
model = PPO2(MlpPolicy, vec_env)
model.learn(total_timesteps=10000);

Evaluation and plot

In [105]:
def policy(obs):
    """Return element 0 of `model.predict(obs)`.

    NOTE(review): element 0 is presumably the action in the stable-baselines
    API -- confirm.
    """
    prediction = model.predict(obs)
    return prediction[0]


# Run one episode of the wrapped environment driven by the trained model.
hist = wenv.run_episode(policy=policy)

In [106]:
# Sum element 2 of the episode history.
# NOTE(review): presumably the per-step rewards -- confirm against wenvs.run_episode.
hist[2].sum()

0.0

#### MPI

In [58]:
def make_wenv(env_id, rank, seed=0, continuous_state=False, continuous_actions=False,
              n_fake_features=2, n_fake_actions=2, n_combinations=1):
    """Build a zero-argument factory that creates a seeded, wrapped environment.

    Calling ``make_wenv`` itself only calls ``set_global_seeds(seed)`` and
    returns the factory; the environment is created each time the returned
    factory is called.

    :param env_id: id passed to ``gym.make``
    :param rank: offset added to ``seed`` when seeding the wrapped environment
    :param seed: base seed; also passed to ``set_global_seeds``
    :param continuous_state: forwarded to ``wenvs.WrapperEnv``
    :param continuous_actions: forwarded to ``wenvs.WrapperEnv``
    :param n_fake_features: forwarded to ``wenvs.WrapperEnv`` (default 2, the
        value that used to be hard-coded)
    :param n_fake_actions: forwarded to ``wenvs.WrapperEnv`` (default 2, as before)
    :param n_combinations: forwarded to ``wenvs.WrapperEnv`` (default 1, as before)
    :return: a callable taking no arguments and returning the wrapped
        environment, seeded with ``seed + rank``
    """
    def _init():
        env = gym.make(env_id)
        wenv = wenvs.WrapperEnv(
            env,
            n_fake_features=n_fake_features,
            n_fake_actions=n_fake_actions,
            n_combinations=n_combinations,
            continuous_state=continuous_state,
            continuous_actions=continuous_actions,
        )
        # A distinct seed per rank, so environments built with different
        # ranks are seeded differently.
        wenv.seed(seed + rank)
        return wenv

    # Runs once, when the factory is built (not when it is invoked).
    set_global_seeds(seed)
    return _init

In [93]:
# Build the factory for rank 0 with base seed 1000 and call it immediately,
# giving a standalone wrapped environment.
wenv = make_wenv(ENV_CONTINUOUS, rank=0, seed=1000, continuous_state=True)()

In [94]:
# Four environment factories, ranks 0..3, using make_wenv's default base seed.
vec_env = SubprocVecEnv([
    make_wenv(ENV_CONTINUOUS, rank, continuous_state=True)
    for rank in range(4)
])

In [95]:
# Train an A2C agent with an MLP policy on the vectorized environments.
# stable-baselines' `learn` returns the model itself, not a training history,
# so its result is no longer bound to `hist` (that name is assigned by the
# episode run that follows). The trailing `;` keeps the cell from echoing it.
model = A2C(MlpPolicy, vec_env)
model.learn(total_timesteps=50000);

In [96]:
def policy(obs):
    """Return element 0 of `model.predict(obs)`.

    NOTE(review): element 0 is presumably the action in the stable-baselines
    API -- confirm.
    """
    prediction = model.predict(obs)
    return prediction[0]
# Original note: vec_env.env_method doesn't work, but it needs a function, not
# a lambda -- hence the `def` here.

In [99]:
# Run one rendered episode on the standalone `wenv` (not on vec_env), with
# actions chosen by `policy`.
hist = wenv.run_episode(policy=policy, render=True)

In [101]:
# Sum element 2 of the episode history.
# NOTE(review): presumably the per-step rewards -- confirm against wenvs.run_episode.
hist[2].sum()

200.0