# FrozenLake (non-slippery) with Monte Carlo methods

- [REINFORCE (Monte Carlo policy gradient)](#REINFORCE-(Monte-Carlo-policy-gradient))
- [Value-based control: MonteCarloQ](#Value-based-control:-MonteCarloQ)
- [Actor-Critic with MonteCarloV](#Actor-Critic-with-MonteCarloV)

# REINFORCE (Monte Carlo policy gradient)

In [1]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearQ, LinearV
from keras_gym.algorithms import Reinforce, NStepBootstrapV
from keras_gym.policies import LinearSoftmaxPolicy


# non-slippery version of FrozenLake
env = FrozenLakeEnv(is_slippery=False)
# human-readable action names, looked up by display_proba when printing
actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}


# softmax policy, trained by the Reinforce algorithm
policy = LinearSoftmaxPolicy(env, lr=0.1)
algo = Reinforce(policy)

# value function: trained by its own algorithm (algo_v) on the same
# transitions; Reinforce above is constructed from the policy only, and V is
# otherwise only read by display_proba
V = LinearV(env, lr=0.1)
algo_v = NStepBootstrapV(V, n=10, experience_cache_size=100000)


def display_proba(s):
    """Print the value estimate and the policy's action probabilities for ``s``.

    Reads the module-level ``policy``, ``V`` and ``actions``. The output is
    ``V(s)`` followed by one row per action; every action whose probability
    equals the maximum is flagged with ``*``.

    Args:
        s: The state to report on, passed unchanged to ``policy.proba`` and
            ``V``.
    """
    # Imported here because this cell never imports numpy at the top; relying
    # on a global ``np`` raises NameError on a freshly started kernel.
    import numpy as np

    proba = policy.proba(s).p
    pmax = np.max(proba)

    # NOTE(review): the position in ``proba`` is used as the key into
    # ``actions``, which assumes the action constants are 0..n-1 -- confirm.
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(
        '\nV(s)={:.3f}'.format(V(s)) +
        '\npi(a|s={}):\n'.format(s) +
        '\n'.join(rows) + '\n')



def run_episode(update=False, render=False, max_steps=100):
    """Run one episode on the module-level ``env``.

    Args:
        update: When true, actions are drawn with ``policy.thompson`` and
            every transition is passed to ``algo.update`` and
            ``algo_v.update``. When false, actions come from
            ``policy.greedy`` and nothing is updated.
        render: When true, render the environment and print the policy
            summary before every step, and render once more after the
            episode has ended.
        max_steps: Maximum number of environment steps before the episode is
            cut off, even if the environment has not reported ``done``.
    """
    s = env.reset()
    for _ in range(max_steps):
        if render:
            env.render()
            display_proba(s)

        # draw action and take a step
        a = policy.thompson(s) if update else policy.greedy(s)
        s_next, r, done, _info = env.step(a)
        if s_next == s:
            # the step did not change the state: overwrite the reward with a
            # small penalty as an incentive to keep moving
            r = -0.1

        if update:
            algo.update(s, a, r, s_next, done)
            algo_v.update(s, a, r, s_next, done)

        if done:
            break

        # prepare for next step
        s = s_next

    if render:
        env.render()


# training: 500 episodes with updates enabled
for _ in range(500):
    run_episode(update=True)

# evaluation: one rendered episode with updates left at their default (off)
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=0.701
pi(a|s=0):
  0.007 - left
* 0.911 - down
  0.078 - right
  0.004 - up

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

V(s)=0.792
pi(a|s=4):
  0.008 - left
* 0.936 - down
  0.050 - right
  0.006 - up

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG

V(s)=1.012
pi(a|s=8):
  0.007 - left
  0.038 - down
* 0.949 - right
  0.006 - up

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

V(s)=1.121
pi(a|s=9):
  0.022 - left
  0.195 - down
* 0.773 - right
  0.009 - up

  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG

V(s)=1.257
pi(a|s=10):
  0.009 - left
* 0.944 - down
  0.042 - right
  0.005 - up

  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=1.385
pi(a|s=14):
  0.009 - left
  0.080 - down
* 0.906 - right
  0.005 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


# Value-based control: MonteCarloQ

In [2]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearQ, LinearV
from keras_gym.algorithms import MonteCarloQ, NStepBootstrapV
from keras_gym.policies import ValueBasedPolicy


# non-slippery version of FrozenLake
env = FrozenLakeEnv(is_slippery=False)
# human-readable action names, looked up by display_proba when printing
actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}


# value-based policy: the policy is derived from Q, and Q is what MonteCarloQ
# updates
Q = LinearQ(env, lr=0.1)
policy = ValueBasedPolicy(Q, boltzmann_temperature=0.1)
algo = MonteCarloQ(Q)

# value function: trained separately by algo_v on the same transitions;
# otherwise V is only read by display_proba
V = LinearV(env, lr=0.1)
algo_v = NStepBootstrapV(V, n=10, experience_cache_size=100000)


def display_proba(s):
    """Print the value estimate and the policy's action probabilities for ``s``.

    Reads the module-level ``policy``, ``V`` and ``actions``. The output is
    ``V(s)`` followed by one row per action; every action whose probability
    equals the maximum is flagged with ``*``.

    Args:
        s: The state to report on, passed unchanged to ``policy.proba`` and
            ``V``.
    """
    # Imported here because this cell never imports numpy at the top; relying
    # on a global ``np`` raises NameError on a freshly started kernel.
    import numpy as np

    proba = policy.proba(s).p
    pmax = np.max(proba)

    # NOTE(review): the position in ``proba`` is used as the key into
    # ``actions``, which assumes the action constants are 0..n-1 -- confirm.
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(
        '\nV(s)={:.3f}'.format(V(s)) +
        '\npi(a|s={}):\n'.format(s) +
        '\n'.join(rows) + '\n')



def run_episode(update=False, render=False, max_steps=100):
    """Run one episode on the module-level ``env``.

    Args:
        update: When true, actions are drawn with ``policy.thompson`` and
            every transition is passed to ``algo.update`` and
            ``algo_v.update``. When false, actions come from
            ``policy.greedy`` and nothing is updated.
        render: When true, render the environment and print the policy
            summary before every step, and render once more after the
            episode has ended.
        max_steps: Maximum number of environment steps before the episode is
            cut off, even if the environment has not reported ``done``.
    """
    s = env.reset()
    for _ in range(max_steps):
        if render:
            env.render()
            display_proba(s)

        # draw action and take a step
        a = policy.thompson(s) if update else policy.greedy(s)
        s_next, r, done, _info = env.step(a)
        if s_next == s:
            # the step did not change the state: overwrite the reward with a
            # small penalty as an incentive to keep moving
            r = -0.1

        if update:
            algo.update(s, a, r, s_next, done)
            algo_v.update(s, a, r, s_next, done)

        if done:
            break

        # prepare for next step
        s = s_next

    if render:
        env.render()


# training: 500 episodes with updates enabled
for _ in range(500):
    run_episode(update=True)

# evaluation: one rendered episode with updates left at their default (off)
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=0.683
pi(a|s=0):
  0.109 - left
* 0.818 - down
  0.043 - right
  0.030 - up

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

V(s)=0.765
pi(a|s=4):
  0.016 - left
* 0.950 - down
  0.014 - right
  0.021 - up

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG

V(s)=0.988
pi(a|s=8):
  0.002 - left
  0.009 - down
* 0.948 - right
  0.041 - up

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

V(s)=1.114
pi(a|s=9):
  0.022 - left
  0.005 - down
* 0.964 - right
  0.009 - up

  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG

V(s)=1.267
pi(a|s=10):
  0.004 - left
* 0.988 - down
  0.003 - right
  0.005 - up

  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=1.403
pi(a|s=14):
  0.001 - left
  0.004 - down
* 0.993 - right
  0.002 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


# Actor-Critic with MonteCarloV

In [3]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearV
from keras_gym.algorithms import MonteCarloV, ValueTD0, NStepBootstrapV
from keras_gym.policies import LinearSoftmaxPolicy, GenericActorCritic


# non-slippery version of FrozenLake
env = FrozenLakeEnv(is_slippery=False)
# human-readable action names, looked up by display_proba when printing
actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}


# actor-critic: a linear softmax policy paired with a linear state value
# function
# NOTE(review): unlike the other sections, no lr is passed to either
# component here, so the library defaults apply -- confirm this is intended.
ac = GenericActorCritic(
    policy=LinearSoftmaxPolicy(env),
    value_function=LinearV(env))
# alternatives left disabled (LinearSoftmaxActorCritic is not imported in
# this cell):
# ac = LinearSoftmaxActorCritic(env, lr=0.1)
# algo = ValueTD0(ac)
algo = MonteCarloV(ac)


def display_proba(s):
    """Print the value estimate and the policy's action probabilities for ``s``.

    Reads the module-level actor-critic ``ac`` (its ``policy`` and
    ``value_function``) and ``actions``. The output is ``V(s)`` followed by
    one row per action; every action whose probability equals the maximum is
    flagged with ``*``.

    Args:
        s: The state to report on, passed unchanged to ``ac.policy.proba``
            and ``ac.value_function``.
    """
    # Imported here because this cell never imports numpy at the top; relying
    # on a global ``np`` raises NameError on a freshly started kernel.
    import numpy as np

    proba = ac.policy.proba(s).p
    pmax = np.max(proba)

    # NOTE(review): the position in ``proba`` is used as the key into
    # ``actions``, which assumes the action constants are 0..n-1 -- confirm.
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(
        '\nV(s)={:.3f}'.format(ac.value_function(s)) +
        '\npi(a|s={}):\n'.format(s) +
        '\n'.join(rows) + '\n')



def run_episode(update=False, render=False, max_steps=100):
    """Run one episode on the module-level ``env`` with the actor-critic ``ac``.

    Args:
        update: When true, actions are drawn with ``ac.policy.thompson`` and
            every transition is passed to ``algo.update``. When false,
            actions come from ``ac.policy.greedy`` and nothing is updated.
        render: When true, render the environment and print the policy
            summary before every step, and render once more after the
            episode has ended.
        max_steps: Maximum number of environment steps before the episode is
            cut off, even if the environment has not reported ``done``.
    """
    s = env.reset()
    for _ in range(max_steps):
        if render:
            env.render()
            display_proba(s)

        # draw action and take a step
        a = ac.policy.thompson(s) if update else ac.policy.greedy(s)
        s_next, r, done, _info = env.step(a)
        if s_next == s:
            # the step did not change the state: overwrite the reward with a
            # small penalty as an incentive to keep moving
            r = -0.01

        if update:
            algo.update(s, a, r, s_next, done)

        if done:
            break

        # prepare for next step
        s = s_next

    if render:
        env.render()


# training: 500 episodes with updates enabled
for _ in range(500):
    run_episode(update=True)

# evaluation: one rendered episode with updates left at their default (off)
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=-0.028
pi(a|s=0):
  0.199 - left
  0.298 - down
* 0.300 - right
  0.203 - up

  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

V(s)=-0.006
pi(a|s=1):
  0.207 - left
  0.289 - down
* 0.295 - right
  0.209 - up

  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG

V(s)=0.041
pi(a|s=2):
  0.207 - left
* 0.298 - down
  0.289 - right
  0.207 - up

  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG

V(s)=0.097
pi(a|s=6):
  0.206 - left
* 0.297 - down
  0.287 - right
  0.210 - up

  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG

V(s)=0.166
pi(a|s=10):
  0.205 - left
* 0.303 - down
  0.284 - right
  0.208 - up

  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=0.361
pi(a|s=14):
  0.201 - left
  0.281 - down
* 0.313 - right
  0.205 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
