# Test of wrapped environments

## Normal env

Let's start by training a DQN agent on a normal (unwrapped) environment.

In [5]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Get the environment and extract the number of actions.

In [6]:
ENV_NAME = 'CartPole-v1'
# Single source of truth for the seed shared by NumPy and the environment.
SEED = 123

# Create the environment and seed both random sources used in this cell.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
# Number of discrete actions; used as the size of the network's output layer.
nb_actions = env.action_space.n

Next, we build a very simple model.

In [7]:
# Small fully connected network: flattened observation -> 3 x (Dense(16) + ReLU)
# -> one linear output per action.
model = Sequential()
# The leading 1 in the input shape matches window_length=1 of the
# SequentialMemory configured in a later cell.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for hidden_layer in range(3):
    model.add(Dense(16))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

Finally, we configure and compile our agent. You can use every built-in Keras optimizer and even the metrics!

In [8]:
# Replay memory holding up to 50000 entries, with a window of one observation.
memory = SequentialMemory(limit=50000, window_length=1)
# Boltzmann Q-policy with its default settings.
policy = BoltzmannQPolicy()

# Assemble the DQN agent from the network, memory and policy built above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)

# Compile with Adam and report mean absolute error as an extra metric.
optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

Okay, now it's time to learn something! We keep visualization disabled here (`visualize=False`) because rendering the environment slows down training quite a lot. You can always safely abort the training prematurely using Ctrl + C (or by interrupting the kernel).

In [10]:
# Train for 10000 steps without rendering; the return value of fit() is kept in `hist`.
hist = dqn.fit(
    env,
    nb_steps=10000,
    visualize=False,
    verbose=1,
)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 28.550 seconds


After training is done, we save the final weights.

In [11]:
# Build the weights filename from the environment name plus a "_normal" tag,
# then write it, replacing any existing file.
weights_path = 'dqn_{}_weights.h5f'.format(ENV_NAME + "_normal")
dqn.save_weights(weights_path, overwrite=True)

Finally, evaluate our algorithm for 5 episodes.

In [14]:
# Run the trained agent for 5 rendered episodes.
dqn.test(
    env,
    nb_episodes=5,
    visualize=True,
)
# Close the environment once evaluation is finished.
env.close()

Testing for 5 episodes ...
Episode 1: reward: 257.000, steps: 257
Episode 2: reward: 227.000, steps: 227
Episode 3: reward: 290.000, steps: 290
Episode 4: reward: 281.000, steps: 281
Episode 5: reward: 244.000, steps: 244


## Wrapped env test

In [16]:
# Load IPython's autoreload extension so that edits to imported modules
# (e.g. the local `wenvs` module imported below) are picked up without a kernel restart.
%load_ext autoreload
# Mode 2: reload modules automatically before executing each cell.
%autoreload 2

In [17]:
import wenvs

### Simple Q-Learning

Simple tabular Q-learning on the discrete FrozenLake problem, run through the wrapped environment.

In [25]:
# Base FrozenLake environment; this rebinds `env`, which previously held the CartPole environment.
env = gym.make('FrozenLake-v0')
# Wrap the base environment.
# NOTE(review): presumably n_fake_features / n_fake_actions add that many extra
# (irrelevant) observation features and actions - confirm against wenvs.WrapperEnv.
wenv = wenvs.WrapperEnv(env, n_fake_features=2, n_fake_actions=2, n_combinations=1)
# Seed the underlying environment; the trailing ';' suppresses the cell's output.
env.seed(0);

In [26]:
# Run one episode with run_episode's default arguments (no policy is passed);
# the saved output shows the rendered grid after each step.
# Note: this rebinds `hist`, which previously held the return value of dqn.fit().
hist = wenv.run_episode()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG


In [27]:
# Per-dimension sizes of the wrapped action and observation spaces.
action_dims = wenvs.discrete_space_size(wenv.action_space)
state_dims = wenvs.discrete_space_size(wenv.observation_space)

# Total number of distinct actions / states is the product over the dimensions.
n_actions = np.prod(action_dims)
n_state = np.prod(state_dims)

# (states, actions) pair passed to the Q-learning routine in the next cell.
cs = (n_state, n_actions)

In [28]:
# Run Q-learning on the wrapped environment with a budget of one million iterations.
# NOTE(review): `q` and `pi` are presumably the learned Q-table and policy table - confirm in wenvs.
max_iterations = 10 ** 6
q, pi = wenvs.Q_learing(wenv, cs, iterMax=max_iterations)

In [29]:
# Index of the highest-valued column in each row of `pi`.
pol = pi.argmax(axis=1)


# Note: this rebinds `policy`, which an earlier cell bound to the
# BoltzmannQPolicy instance passed to the DQN agent.
def policy(obs):
    """Return the greedy action for observation `obs`.

    The observation is encoded with `wenv.encode_obs`, the result indexes
    `pol`, and the selected entry is decoded with `wenv.decode_act`.
    """
    return wenv.decode_act(pol[wenv.encode_obs(obs)])

In [40]:
# Run one episode with the greedy policy and display the last entry of the
# third element returned by run_episode.
# NOTE(review): that element is presumably the per-step reward list - confirm in wenvs.
# NOTE(review): the saved output still shows rendered frames and "ok" lines despite
# render=False - possibly stale output or leftover debug prints in wenvs; confirm.
greedy_episode = wenv.run_episode(policy=policy, render=False)
greedy_episode[2][-1]

ok

[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
ok
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
ok
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
ok
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
ok
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
ok
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
ok
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
ok
  (Left)
SFFF
[41mF[0mHFH


0.0