# FrozenLake (non-slippery) with Monte Carlo methods

- [REINFORCE (Monte Carlo policy gradient)](#REINFORCE-(Monte-Carlo-policy-gradient))
- [Value-based Monte Carlo](#Value-based-Monte-Carlo)

# REINFORCE (Monte Carlo policy gradient)

In [21]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearQ, LinearV
from keras_gym.algorithms import Reinforce, NStepBootstrapV
from keras_gym.policies import LinearSoftmaxPolicy


# FrozenLake environment with slipping disabled
env = FrozenLakeEnv(is_slippery=False)
# printable label for each action constant
actions = {
    UP: 'up',
    DOWN: 'down',
    LEFT: 'left',
    RIGHT: 'right',
}


# linear softmax policy, updated through the Reinforce algorithm object
policy = LinearSoftmaxPolicy(env, lr=0.1)
algo = Reinforce(policy)

# linear state-value function, updated through NStepBootstrapV
V = LinearV(env, lr=0.1)
algo_v = NStepBootstrapV(V, n=10, experience_cache_size=100000)


def display_proba(s):
    """Print the value estimate and the action probabilities for state ``s``.

    Reads the module-level ``policy``, ``V`` and ``actions`` objects. One line
    is printed per action in the form ``<flag> <probability> - <label>``; the
    flag is ``*`` for every action whose probability equals the maximum and a
    blank otherwise.

    Parameters
    ----------
    s
        The state to inspect; it is passed as-is to ``policy.proba`` and ``V``.
    """
    # Imported locally because numpy is not imported anywhere in this cell;
    # without this the function only works if an earlier cell defined ``np``.
    import numpy as np

    proba = policy.proba(s).p
    pmax = np.max(proba)
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(
        '\nV(s)={:.3f}'.format(V(s)) +
        '\npi(a|s={}):\n'.format(s) +
        '\n'.join(rows) + '\n')



def run_episode(update=False, render=False):
    """Play one episode on the module-level ``env``, capped at 100 steps.

    Parameters
    ----------
    update : bool
        When true, actions come from ``policy.thompson`` and every transition
        is passed to ``algo.update`` and ``algo_v.update``. When false,
        actions come from ``policy.greedy`` and nothing is updated.
    render : bool
        When true, the environment is rendered and ``display_proba`` is
        called before each step, and the environment is rendered once more
        after the episode ends.
    """
    max_steps = 100
    s = env.reset()

    for _ in range(max_steps):
        if render:
            env.render()
            display_proba(s)

        # pick the action according to the current mode, then step
        if update:
            a = policy.thompson(s)
        else:
            a = policy.greedy(s)
        s_next, r, done, info = env.step(a)

        # a move that leaves the state unchanged gets a small penalty,
        # as an incentive to keep moving
        if s_next == s:
            r = -0.1

        if update:
            algo.update(s, a, r, done)
            algo_v.update(s, r, s_next, done)

        if done:
            break

        # carry the new state into the next iteration
        s = s_next

    if render:
        env.render()


# training phase: 500 episodes with updates switched on
for episode in range(500):
    run_episode(update=True)

# evaluation: one rendered episode without updates
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=0.723
pi(a|s=0):
  0.002 - left
* 0.930 - down
  0.063 - right
  0.004 - up

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

V(s)=0.809
pi(a|s=4):
  0.003 - left
* 0.946 - down
  0.045 - right
  0.006 - up

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG

V(s)=0.994
pi(a|s=8):
  0.004 - left
  0.038 - down
* 0.953 - right
  0.006 - up

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

V(s)=1.125
pi(a|s=9):
  0.005 - left
* 0.868 - down
  0.120 - right
  0.007 - up

  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG

V(s)=1.253
pi(a|s=13):
  0.005 - left
  0.106 - down
* 0.878 - right
  0.011 - up

  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=1.384
pi(a|s=14):
  0.004 - left
  0.135 - down
* 0.855 - right
  0.006 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


# Value-based Monte Carlo

In [2]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearQ, LinearV
from keras_gym.algorithms import MonteCarloQ, NStepBootstrapV
from keras_gym.policies import ValueBasedPolicy


# FrozenLake environment with slipping disabled
env = FrozenLakeEnv(is_slippery=False)
# printable label for each action constant
actions = {
    UP: 'up',
    DOWN: 'down',
    LEFT: 'left',
    RIGHT: 'right',
}


# linear Q-function, the policy derived from it, and its MonteCarloQ updater
Q = LinearQ(env, lr=0.1)
policy = ValueBasedPolicy(Q, boltzmann_temperature=0.1)
algo = MonteCarloQ(Q)

# linear state-value function, updated through NStepBootstrapV
V = LinearV(env, lr=0.1)
algo_v = NStepBootstrapV(V, n=10, experience_cache_size=100000)


def display_proba(s):
    """Print the value estimate and the action probabilities for state ``s``.

    Reads the module-level ``policy``, ``V`` and ``actions`` objects. One line
    is printed per action in the form ``<flag> <probability> - <label>``; the
    flag is ``*`` for every action whose probability equals the maximum and a
    blank otherwise.

    Parameters
    ----------
    s
        The state to inspect; it is passed as-is to ``policy.proba`` and ``V``.
    """
    # Imported locally because numpy is not imported anywhere in this cell;
    # without this the function only works if an earlier cell defined ``np``.
    import numpy as np

    proba = policy.proba(s).p
    pmax = np.max(proba)
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(
        '\nV(s)={:.3f}'.format(V(s)) +
        '\npi(a|s={}):\n'.format(s) +
        '\n'.join(rows) + '\n')



def run_episode(update=False, render=False):
    """Play one episode on the module-level ``env``, capped at 100 steps.

    Parameters
    ----------
    update : bool
        When true, actions come from ``policy.thompson`` and every transition
        is passed to ``algo.update`` and ``algo_v.update``. When false,
        actions come from ``policy.greedy`` and nothing is updated.
    render : bool
        When true, the environment is rendered and ``display_proba`` is
        called before each step, and the environment is rendered once more
        after the episode ends.
    """
    max_steps = 100
    s = env.reset()

    for _ in range(max_steps):
        if render:
            env.render()
            display_proba(s)

        # pick the action according to the current mode, then step
        if update:
            a = policy.thompson(s)
        else:
            a = policy.greedy(s)
        s_next, r, done, info = env.step(a)

        # a move that leaves the state unchanged gets a small penalty,
        # as an incentive to keep moving
        if s_next == s:
            r = -0.1

        if update:
            algo.update(s, a, r, done)
            algo_v.update(s, r, s_next, done)

        if done:
            break

        # carry the new state into the next iteration
        s = s_next

    if render:
        env.render()


# training phase: 500 episodes with updates switched on
for episode in range(500):
    run_episode(update=True)

# evaluation: one rendered episode without updates
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=0.585
pi(a|s=0):
  0.066 - left
  0.075 - down
* 0.725 - right
  0.134 - up

  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

V(s)=0.681
pi(a|s=1):
  0.040 - left
  0.007 - down
* 0.874 - right
  0.079 - up

  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG

V(s)=0.822
pi(a|s=2):
  0.114 - left
* 0.764 - down
  0.043 - right
  0.078 - up

  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG

V(s)=0.972
pi(a|s=6):
  0.003 - left
* 0.838 - down
  0.004 - right
  0.154 - up

  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG

V(s)=1.093
pi(a|s=10):
  0.007 - left
* 0.826 - down
  0.007 - right
  0.160 - up

  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=1.296
pi(a|s=14):
  0.002 - left
  0.051 - down
* 0.936 - right
  0.010 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
