In [10]:
from keras_gym.utils import reload_all, feature_vector
reload_all()

from tensorflow import keras
from tensorflow.keras import backend as K

from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT
from keras_gym.value_functions import LinearQ
from keras_gym.algorithms import MonteCarloQ, Reinforce
from keras_gym.policies import ValuePolicy, SoftmaxPolicy
from keras_gym.metrics import SoftmaxPolicyLossWithLogits


# Frozen-lake grid world, constructed with is_slippery=False.
# NOTE(review): presumably this makes transitions deterministic — confirm
# against the gym FrozenLakeEnv documentation.
env = FrozenLakeEnv(is_slippery=False)

# Length of the feature vector built from one sampled observation; this is the
# input width of the policy model created in create_model().
num_features = feature_vector(env.observation_space.sample(), env.observation_space).size
# Number of discrete actions; this is the output width of the policy model.
num_actions = env.action_space.n

# Behavior policy: a LinearQ value function (lr=0.1) wrapped in a ValuePolicy,
# plus the MonteCarloQ algorithm object whose update() is called from
# run_episode() when update=True.
Q = LinearQ(env, lr=0.1)
behavior_policy = ValuePolicy(Q)
behavior_algo = MonteCarloQ(Q)

# function approximator for our policy
def create_model():
    """Build and compile the Keras model that backs the softmax policy.

    The model takes two inputs, in this order:

    * a feature vector of width ``num_features``;
    * a width-1 "advantages" input, which is not connected to the output but
      is handed to ``SoftmaxPolicyLossWithLogits`` when the loss is built.

    The output is the result of a single ``Dense`` layer with ``num_actions``
    units and a zero-initialized kernel (returned as raw logits; no activation
    is applied here).

    Returns:
        The compiled ``keras.Model`` (SGD optimizer, ``lr=0.1``).
    """
    # Input placeholders; creation order is kept (features first).
    features_in = keras.Input(shape=[num_features])
    advantages = keras.Input(shape=[1])

    # One linear layer mapping features to per-action logits. The zero kernel
    # gives identical logits for every action before any training.
    logits = keras.layers.Dense(
        num_actions, kernel_initializer='zeros')(features_in)

    # The loss object is parameterised by the advantages input tensor.
    policy_loss = SoftmaxPolicyLossWithLogits(advantages)

    policy_net = keras.Model(inputs=[features_in, advantages], outputs=logits)
    policy_net.compile(
        loss=policy_loss,
        optimizer=keras.optimizers.SGD(lr=0.1))
    return policy_net


# Target policy under development: the Keras model from create_model() wrapped
# in a SoftmaxPolicy, and the Reinforce algorithm object whose update() is
# called from run_episode() when update=True.
model = create_model()
policy = SoftmaxPolicy(env, model)
algo = Reinforce(policy)


# An initial state and a random action kept as module-level globals for
# interactive inspection. `s` is read again by the later
# `feature_vector(s, env.observation_space).shape` cell; `a` is not referenced
# again at top level in the visible code.
s = env.reset()
a = policy.random()


def _print_distribution(label, proba, s, action_names):
    """Print one action distribution for state ``s``, one action per line.

    Each line shows the probability with three decimals followed by the action
    name; every entry equal to the maximum probability is prefixed with ``*``.
    """
    # Builtin max() instead of np.max(): numpy is never imported in this
    # notebook cell, and the per-entry ``{:.3f}`` formatting below already
    # requires ``proba`` to be a flat sequence of scalars.
    pmax = max(proba)
    print('\n{}(a|s={}):'.format(label, s))
    print('\n'.join(
        "{2} {1:.3f} - {0}".format(action_names[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)))


def display_proba(behavior_policy, policy, s):
    """Print the action probabilities of both policies for state ``s``.

    Args:
        behavior_policy: object whose ``proba(s).p`` is the sequence of action
            probabilities of the behavior policy, indexed by action id.
        policy: same interface, for the policy being learned.
        s: the state to evaluate; it is also echoed in the printed headers.

    The behavior distribution is printed under the header ``b(a|s=...)`` and
    the learned one under ``pi(a|s=...)``, followed by a blank line.
    """
    # Map the gym action constants to human-readable names.
    actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}

    _print_distribution('b', behavior_policy.proba(s).p, s, actions)
    _print_distribution('pi', policy.proba(s).p, s, actions)
    print()


def run_episode(use_behavior_policy=True, epsilon=0, update=False, render=False):
    """Play one episode in the global ``env`` from reset until ``done``.

    Args:
        use_behavior_policy: when true, actions come from
            ``behavior_policy.epsilon_greedy(state, epsilon)``; otherwise from
            ``policy.thompson(state)``.
        epsilon: passed through to ``epsilon_greedy``; unused when the
            behavior policy is not selected.
        update: when true, every transition is fed to ``behavior_algo.update``
            and then to ``algo.update``.
        render: when true, the environment is rendered and both action
            distributions are printed before each step, and the environment
            is rendered once more after the episode ends.
    """
    state = env.reset()

    while True:
        if render:
            env.render()
            display_proba(behavior_policy, policy, state)

        # Pick the action source for this step.
        action = (behavior_policy.epsilon_greedy(state, epsilon)
                  if use_behavior_policy
                  else policy.thompson(state))

        next_state, reward, finished, _ = env.step(action)

        if update:
            # Both learners see the same transition, behavior learner first.
            transition = (state, action, reward, next_state, finished)
            behavior_algo.update(*transition)
            algo.update(*transition)

        if finished:
            break
        state = next_state

    # Show the terminal state.
    if render:
        env.render()


# Training: 200 episodes with actions chosen by the behavior policy
# (run_episode's default use_behavior_policy=True) at epsilon=0.1; update=True
# feeds every transition to both behavior_algo and algo.
for _ in range(200):
    run_episode(epsilon=0.1, update=True)

# Inspection: one rendered episode with no updates. With the defaults this
# still follows the behavior policy (epsilon=0), while display_proba prints the
# action probabilities of both policies at every visited state.
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

b(a|s=0):
  0.238 - left
* 0.288 - down
  0.240 - right
  0.233 - up

pi(a|s=0):
  0.045 - left
* 0.779 - down
  0.138 - right
  0.038 - up

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

b(a|s=4):
  0.241 - left
* 0.300 - down
  0.215 - right
  0.244 - up

pi(a|s=4):
  0.032 - left
* 0.808 - down
  0.126 - right
  0.033 - up

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG

b(a|s=8):
  0.262 - left
  0.193 - down
* 0.309 - right
  0.236 - up

pi(a|s=8):
  0.033 - left
  0.135 - down
* 0.802 - right
  0.030 - up

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

b(a|s=9):
  0.227 - left
* 0.304 - down
  0.244 - right
  0.224 - up

pi(a|s=9):
  0.025 - left
* 0.843 - down
  0.110 - right
  0.022 - up

  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG

b(a|s=13):
  0.206 - left
  0.210 - down
* 0.345 - right
  0.239 - up

pi(a|s=13):
  0.023 - left
  0.110 - down
* 0.844 - right
  0.023 - up

  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG

b(a|s=14):
  0.228 - left
  0.207 - down
* 0.347 - right
  0.218 

In [7]:
from keras_gym.utils import feature_vector

# Shape of the feature vector built for the module-level state `s`
# (the recorded output for this cell is `(16,)`).
feature_vector(s, env.observation_space).shape

(16,)