# FrozenLake (non-slippery) with Monte Carlo methods

- [REINFORCE (Monte Carlo policy gradient)](#REINFORCE-(Monte-Carlo-policy-gradient))
- [Value-based control: MonteCarloQ](#Value-based-control:-MonteCarloQ)
- [Advantage Actor-Critic](#Advantage-Actor-Critic)

# REINFORCE (Monte Carlo policy gradient)

In [1]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearV
from keras_gym.algorithms import Reinforce, NStepBootstrap
from keras_gym.policies import LinearSoftmaxPolicy


# FrozenLake with slipping switched off (is_slippery=False)
env = FrozenLakeEnv(is_slippery=False)

# printable label for each action constant (used by display_proba)
actions = {
    UP: 'up',
    DOWN: 'down',
    LEFT: 'left',
    RIGHT: 'right',
}


# linear softmax policy, updated through the Reinforce algorithm
policy = LinearSoftmaxPolicy(env, lr=0.1)
algo = Reinforce(policy)

# separate linear state-value function, updated through NStepBootstrap
# with n=10; in this cell V is only read back by display_proba
V = LinearV(env, lr=0.1)
algo_v = NStepBootstrap(
    V,
    n=10,
    experience_cache_size=100000,
)


def display_proba(s):
    """Print the value estimate and the action probabilities for state ``s``.

    Reads the module-level ``policy``, ``V`` and ``actions`` objects.  The
    output is ``V(s)`` followed by one line per action showing its
    probability under ``policy.proba(s).p``; every action whose probability
    equals the maximum is flagged with a leading ``*``.

    Parameters
    ----------
    s
        The environment state to inspect (the value passed around by
        ``run_episode``).
    """
    # numpy is not imported at the top of this cell, so bring it into scope
    # here; otherwise np.max raises NameError on a fresh kernel.
    import numpy as np

    proba = policy.proba(s).p
    pmax = np.max(proba)

    header = '\nV(s)={:.3f}'.format(V(s)) + '\npi(a|s={}):\n'.format(s)
    # the index into `proba` is used as the action id when looking up labels
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(header + '\n'.join(rows) + '\n')



def run_episode(update=False, render=False):
    """Play a single episode of at most 100 steps on the module-level ``env``.

    Parameters
    ----------
    update : bool
        When true, actions come from ``policy.thompson`` and every transition
        is passed to ``algo.update`` and ``algo_v.update``.  When false,
        actions come from ``policy.greedy`` and nothing is updated.
    render : bool
        When true, render the environment and print the policy read-out
        before every step, and render once more after the episode ends.
    """
    step_limit = 100
    state = env.reset()

    for _ in range(step_limit):
        if render:
            env.render()
            display_proba(state)

        # pick the action: thompson draw while learning, greedy otherwise
        if update:
            action = policy.thompson(state)
        else:
            action = policy.greedy(state)
        next_state, reward, done, _info = env.step(action)

        # the state did not change: replace the reward with a small penalty
        # as an incentive to keep moving
        if next_state == state:
            reward = -0.1

        if update:
            algo.update(state, action, reward, next_state, done)
            algo_v.update(state, action, reward, next_state, done)

        if done:
            break

        # carry the new state into the next iteration
        state = next_state

    if render:
        env.render()


# training: 500 episodes with updates enabled (actions via policy.thompson)
for _ in range(500):
    run_episode(update=True)

# evaluation: one rendered episode without updates (actions via policy.greedy)
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=0.565
pi(a|s=0):
  0.004 - left
* 0.942 - down
  0.050 - right
  0.004 - up

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

V(s)=0.631
pi(a|s=4):
  0.005 - left
* 0.944 - down
  0.045 - right
  0.006 - up

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG

V(s)=0.795
pi(a|s=8):
  0.006 - left
  0.040 - down
* 0.948 - right
  0.006 - up

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

V(s)=0.880
pi(a|s=9):
  0.006 - left
* 0.931 - down
  0.057 - right
  0.006 - up

  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG

V(s)=1.048
pi(a|s=13):
  0.006 - left
  0.111 - down
* 0.874 - right
  0.009 - up

  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=1.268
pi(a|s=14):
  0.005 - left
  0.059 - down
* 0.930 - right
  0.006 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


# Value-based control: MonteCarloQ

In [2]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearQ, LinearV
from keras_gym.algorithms import MonteCarloQ, NStepBootstrap
from keras_gym.policies import ValueBasedPolicy


# FrozenLake with slipping switched off (is_slippery=False)
env = FrozenLakeEnv(is_slippery=False)

# printable label for each action constant (used by display_proba)
actions = {
    UP: 'up',
    DOWN: 'down',
    LEFT: 'left',
    RIGHT: 'right',
}


# linear Q-function, wrapped in a value-based policy and updated
# through the MonteCarloQ algorithm
Q = LinearQ(env, lr=0.1)
policy = ValueBasedPolicy(
    Q,
    boltzmann_temperature=0.1,
)
algo = MonteCarloQ(Q)

# separate linear state-value function, updated through NStepBootstrap
# with n=10; in this cell V is only read back by display_proba
V = LinearV(env, lr=0.1)
algo_v = NStepBootstrap(
    V,
    n=10,
    experience_cache_size=100000,
)


def display_proba(s):
    """Print the value estimate and the action probabilities for state ``s``.

    Reads the module-level ``policy``, ``V`` and ``actions`` objects.  The
    output is ``V(s)`` followed by one line per action showing its
    probability under ``policy.proba(s).p``; every action whose probability
    equals the maximum is flagged with a leading ``*``.

    Parameters
    ----------
    s
        The environment state to inspect (the value passed around by
        ``run_episode``).
    """
    # numpy is not imported at the top of this cell, so bring it into scope
    # here; otherwise np.max raises NameError on a fresh kernel.
    import numpy as np

    proba = policy.proba(s).p
    pmax = np.max(proba)

    header = '\nV(s)={:.3f}'.format(V(s)) + '\npi(a|s={}):\n'.format(s)
    # the index into `proba` is used as the action id when looking up labels
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(header + '\n'.join(rows) + '\n')



def run_episode(update=False, render=False):
    """Play a single episode of at most 100 steps on the module-level ``env``.

    Parameters
    ----------
    update : bool
        When true, actions come from ``policy.thompson`` and every transition
        is passed to ``algo.update`` and ``algo_v.update``.  When false,
        actions come from ``policy.greedy`` and nothing is updated.
    render : bool
        When true, render the environment and print the policy read-out
        before every step, and render once more after the episode ends.
    """
    step_limit = 100
    state = env.reset()

    for _ in range(step_limit):
        if render:
            env.render()
            display_proba(state)

        # pick the action: thompson draw while learning, greedy otherwise
        if update:
            action = policy.thompson(state)
        else:
            action = policy.greedy(state)
        next_state, reward, done, _info = env.step(action)

        # the state did not change: replace the reward with a small penalty
        # as an incentive to keep moving
        if next_state == state:
            reward = -0.1

        if update:
            algo.update(state, action, reward, next_state, done)
            algo_v.update(state, action, reward, next_state, done)

        if done:
            break

        # carry the new state into the next iteration
        state = next_state

    if render:
        env.render()


# training: 500 episodes with updates enabled (actions via policy.thompson)
for _ in range(500):
    run_episode(update=True)

# evaluation: one rendered episode without updates (actions via policy.greedy)
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=0.594
pi(a|s=0):
  0.187 - left
* 0.443 - down
  0.307 - right
  0.063 - up

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

V(s)=0.586
pi(a|s=4):
  0.076 - left
* 0.799 - down
  0.011 - right
  0.113 - up

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG

V(s)=0.727
pi(a|s=8):
  0.042 - left
  0.033 - down
* 0.825 - right
  0.100 - up

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

V(s)=0.999
pi(a|s=9):
  0.023 - left
* 0.649 - down
  0.324 - right
  0.004 - up

  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG

V(s)=1.091
pi(a|s=13):
  0.006 - left
  0.033 - down
* 0.940 - right
  0.020 - up

  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=1.226
pi(a|s=14):
  0.023 - left
  0.055 - down
* 0.896 - right
  0.026 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


# Advantage Actor-Critic

In [3]:
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.utils import reload_all
reload_all()

from keras_gym.value_functions import LinearV
from keras_gym.algorithms import MonteCarloV, NStepBootstrap, AdvantageActorCritic
from keras_gym.policies import LinearSoftmaxPolicy


# FrozenLake with slipping switched off (is_slippery=False)
env = FrozenLakeEnv(is_slippery=False)

# printable label for each action constant (used by display_proba)
actions = {
    UP: 'up',
    DOWN: 'down',
    LEFT: 'left',
    RIGHT: 'right',
}


# actor-critic: a linear softmax policy paired with a linear state-value
# function; the pair is updated through NStepBootstrap with n=200
# (n exceeds the 100-step cap that run_episode places on an episode)
ac = AdvantageActorCritic(
    policy=LinearSoftmaxPolicy(env, lr=0.01),
    value_function=LinearV(env, lr=0.1),
)
algo = NStepBootstrap(ac, n=200)


def display_proba(s):
    """Print the value estimate and the action probabilities for state ``s``.

    Reads the module-level ``ac`` and ``actions`` objects.  The output is
    ``ac.value_function(s)`` followed by one line per action showing its
    probability under ``ac.policy.proba(s).p``; every action whose
    probability equals the maximum is flagged with a leading ``*``.

    Parameters
    ----------
    s
        The environment state to inspect (the value passed around by
        ``run_episode``).
    """
    # numpy is not imported at the top of this cell, so bring it into scope
    # here; otherwise np.max raises NameError on a fresh kernel.
    import numpy as np

    proba = ac.policy.proba(s).p
    pmax = np.max(proba)

    header = (
        '\nV(s)={:.3f}'.format(ac.value_function(s)) +
        '\npi(a|s={}):\n'.format(s))
    # the index into `proba` is used as the action id when looking up labels
    rows = [
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)
    ]
    print(header + '\n'.join(rows) + '\n')



def run_episode(update=False, render=False):
    """Play a single episode of at most 100 steps on the module-level ``env``.

    Parameters
    ----------
    update : bool
        When true, actions come from ``ac.policy.thompson`` and every
        transition is passed to ``algo.update``.  When false, actions come
        from ``ac.policy.greedy`` and nothing is updated.
    render : bool
        When true, render the environment and print the policy read-out
        before every step, and render once more after the episode ends.
    """
    step_limit = 100
    state = env.reset()

    for _ in range(step_limit):
        if render:
            env.render()
            display_proba(state)

        # pick the action: thompson draw while learning, greedy otherwise
        if update:
            action = ac.policy.thompson(state)
        else:
            action = ac.policy.greedy(state)
        next_state, reward, done, _info = env.step(action)

        # the state did not change: replace the reward with a small penalty
        # as an incentive to keep moving
        if next_state == state:
            reward = -0.01

        if update:
            algo.update(state, action, reward, next_state, done)

        if done:
            break

        # carry the new state into the next iteration
        state = next_state

    if render:
        env.render()


# training: 500 episodes with updates enabled (actions via ac.policy.thompson)
for _ in range(500):
    run_episode(update=True)

# evaluation: one rendered episode without updates (actions via ac.policy.greedy)
run_episode(render=True)


[41mS[0mFFF
FHFH
FFFH
HFFG

V(s)=0.016
pi(a|s=0):
  0.213 - left
* 0.291 - down
  0.290 - right
  0.206 - up

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

V(s)=0.121
pi(a|s=4):
  0.215 - left
* 0.288 - down
  0.286 - right
  0.211 - up

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG

V(s)=0.187
pi(a|s=8):
  0.219 - left
  0.277 - down
* 0.293 - right
  0.212 - up

  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

V(s)=0.249
pi(a|s=9):
  0.216 - left
  0.284 - down
* 0.290 - right
  0.210 - up

  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG

V(s)=0.286
pi(a|s=10):
  0.218 - left
* 0.292 - down
  0.281 - right
  0.209 - up

  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG

V(s)=0.686
pi(a|s=14):
  0.215 - left
  0.277 - down
* 0.303 - right
  0.205 - up

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
