# Test of wrapped environments

Generic imports:

In [1]:
import numpy as np
import gym

In [2]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import wenvs

Basic configuration:

In [4]:
# Gym environment ids shared by the experiments below.
ENV_CONTINUOUS = "CartPole-v1"  # wrapped with continuous_state=True
ENV_DISCRETE = "Taxi-v2"  # used for the tabular Q-learning experiments

## Normal env

In [19]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import DDPG, A2C, PPO2

Learn on usual environment

In [21]:
# Build the plain environment once, then expose it two ways:
#   * `wenv` - the project wrapper, used later to run evaluation episodes;
#   * `env`  - a single-environment DummyVecEnv handed to the learning algorithm.
# The id comes from the config cell rather than a second hard-coded copy.
base_env = gym.make(ENV_CONTINUOUS)
wenv = wenvs.WrapperEnv(base_env, continuous_state=True)

# Seed NumPy's global RNG and the wrapped environment.
np.random.seed(0)
wenv.seed(0)

# The factory closes over `base_env`, not `env`: `env` is rebound to the
# vectorized wrapper on this very line, so a factory that looked up `env`
# lazily would hand back the vectorized environment if it were called again.
env = DummyVecEnv([lambda: base_env])  # The algorithms require a vectorized environment to run

In [23]:
# Train PPO2 with an MLP policy on the vectorized environment.
# NOTE(review): stable-baselines documents `learn` as returning the trained
# model itself, so `hist` presumably aliases `model` rather than holding a
# training history - confirm before relying on it.
model = PPO2(policy=MlpPolicy, env=env)
hist = model.learn(total_timesteps=10000)

Run episode of normal environment

In [31]:
def policy(obs):
    """Return element 0 of `model.predict(obs)` as the action for `obs`."""
    return model.predict(obs)[0]


# Roll out one episode on the wrapped environment, driven by the trained model.
hist = wenv.run_episode(policy=policy)

Finally, evaluate our algorithm and plot.

In [40]:
# Sum element 2 of the episode history returned by `run_episode`.
# NOTE(review): presumably the per-step rewards, i.e. the episode return - confirm.
hist[2].sum()

-776

## Wrapped env test

Magic to develop custom wrapper: 

### Simple Q-Learning

Simple Q-learning on the discrete Taxi problem (`ENV_DISCRETE`)

In [33]:
# Wrap the discrete environment with the fake-feature / fake-action settings below.
env = gym.make(ENV_DISCRETE)
wenv = wenvs.WrapperEnv(
    env,
    n_fake_features=2,
    n_fake_actions=2,
    n_combinations=1,
)

# Seed NumPy's global RNG and the wrapped environment.
# The trailing ';' on the last line keeps its return value out of the cell output.
np.random.seed(0)
wenv.seed(0);

Run episode

In [34]:
# Run one episode with `run_episode`'s default arguments (no policy is passed).
# NOTE(review): other cells call `wenv.run_episode` directly; presumably going
# through `unwrapped` is equivalent here - confirm against wenvs.WrapperEnv.
hist = wenv.unwrapped.run_episode()
# Sum element 2 of the returned history (presumably the rewards - confirm).
hist[2].sum()

Tabular learning for discrete spaces

In [36]:
# Sizes reported by `discrete_space_size` for the wrapped observation and
# action spaces (presumably one entry per dimension - confirm in wenvs).
state_sizes = wenvs.discrete_space_size(wenv.observation_space)
act_sizes = wenvs.discrete_space_size(wenv.action_space)

# Number of entries in each size list.
n_obs, n_act = len(state_sizes), len(act_sizes)

# Product of the sizes: the number of cells needed along each axis of a flat table.
n_state = np.prod(state_sizes)
n_actions = np.prod(act_sizes)

# Pair passed to the tabular learner: (flattened states, flattened actions).
cs = n_state, n_actions

In [37]:
# Tabular Q-learning on the wrapped environment, using the flat sizes in `cs`.
q, pi = wenvs.Q_learing(wenv, cs, iterMax=int(1e5))  # eventually %reset array


def policy(obs, *, _pi=pi, _wenv=wenv):
    """Map an observation to an action via the table `pi` learned in this cell.

    The observation is encoded with `encode_obs`, used to index the table, and
    the entry found there is decoded with `decode_act`.

    `_pi` and `_wenv` are bound as keyword-only defaults when this cell runs,
    so the policy keeps using this cell's table and environment even after the
    notebook-level names `pi` and `wenv` are reassigned by later cells. Callers
    should pass `obs` only.
    """
    return _wenv.decode_act(_pi[_wenv.encode_obs(obs)])

Evaluation

In [39]:
# Run one episode with the tabular policy, rendering disabled, and sum
# element 2 of the returned history.
(
    wenv.unwrapped
    .run_episode(policy=policy, render=False)[2]
    .sum()
)

-200

Alternative function test

In [14]:
# Multi-dimensional variant: the learner receives the size lists directly
# instead of the flattened pair.
q, pi = wenvs.Q_learing_multidim(wenv, state_sizes, act_sizes, iterMax=int(1e5))


def policy_dim(obs, *, _pi=pi):
    """Index the table learned in this cell with `obs` and return the entry as a tuple.

    `_pi` is bound as a keyword-only default when this cell runs, so the policy
    keeps using this cell's table even if the notebook-level name `pi` is
    reassigned by another cell (the flat Q-learning cell assigns `pi` too).
    Callers should pass `obs` only.
    """
    return tuple(_pi[tuple(obs)])

In [16]:
# Run one episode with the multi-dimensional policy, rendering disabled, and
# sum element 2 of the returned history.
(
    wenv
    .run_episode(policy=policy_dim, render=False)[2]
    .sum()
)

-200

Proper policy evaluation

### Baselines

Training with a state-of-the-art algorithm for continuous spaces

In [17]:
# Wrap the continuous-state environment with the same fake-feature /
# fake-action settings as the discrete experiment.
env = gym.make(ENV_CONTINUOUS)
wenv = wenvs.WrapperEnv(
    env,
    n_fake_features=2,
    n_fake_actions=2,
    n_combinations=1,
    continuous_state=True,
)

# Seed NumPy's global RNG and the wrapped environment.
# The trailing ';' on the last line keeps its return value out of the cell output.
np.random.seed(0)
wenv.seed(0);

Plot of training

Evaluation and plot