# Table Of Contents

- [N-Step Q-learning](#N-Step-Q-learning)
- [N-Step SARSA](#N-Step-SARSA)
- [N-Step Expected-SARSA](#N-Step-Expected-SARSA)


# N-Step Q-learning

In [1]:
import gym
import numpy as np
from skgym.value_functions import GenericQ
from skgym.policies import ValuePolicy
from skgym.algorithms import NStepQLearning
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer


# Train a Q-function on CartPole with n-step Q-learning, then run one final
# greedy episode that is rendered instead of being used for updates.


# the Gym environment
env = gym.make('CartPole-v0')


# define sklearn model for approximating Q-function
regressor = SGDRegressor(eta0=0.5, learning_rate='constant')
# feature map: the raw input stacked with its element-wise square; this needs
# numpy, which is why it is imported explicitly at the top of this cell
transformer = FunctionTransformer(
    lambda x: np.hstack((x, x ** 2)), validate=False)


# define Q, its induced policy and update algorithm
Q = GenericQ(env, regressor, transformer)
policy = ValuePolicy(Q)
algo = NStepQLearning(Q, n=20, gamma=0.8)


# number of iterations
num_episodes = 200
# NOTE(review): this reads a private attribute of the env object -- confirm it
# is still available when upgrading gym
max_episode_steps = env._max_episode_steps


# used for early stopping: once this many episodes in a row have succeeded,
# the next episode becomes the final (rendered, greedy) one
success_streak_target = 9
num_consecutive_successes = 0


try:
    for episode in range(1, num_episodes + 1):
        # the final episode is either the last scheduled one or the one
        # that starts right after the success streak has been reached
        last_episode = (
            episode == num_episodes
            or num_consecutive_successes == success_streak_target)

        # init
        s = env.reset()

        # amount of random exploration
        if last_episode:
            epsilon = 0
            env.render()
        elif episode < 10:
            epsilon = 0.5
        else:
            epsilon = 0.01

        for t in range(1, max_episode_steps + 1):
            a = policy.epsilon_greedy(s, epsilon)
            s_next, r, done, info = env.step(a)

            # update or render: the final episode only shows the policy,
            # it does not feed the learning algorithm
            if not last_episode:
                algo.update(s, a, r, s_next, done)
            else:
                env.render()

            # keep track of consecutive successes: an episode counts as a
            # success only when it is done at the very last allowed step
            if done:
                if t == max_episode_steps:
                    num_consecutive_successes += 1
                    print(f"num_consecutive_successes = {num_consecutive_successes}")
                else:
                    num_consecutive_successes = 0
                    print(f"failed after {t} steps")
                break

            # prepare for next step
            s = s_next

        if last_episode:
            break
finally:
    # always close the environment, also when the loop raises or is
    # interrupted from the notebook
    env.close()

failed after 23 steps
failed after 14 steps
failed after 15 steps
failed after 15 steps
failed after 14 steps
failed after 10 steps
failed after 25 steps
failed after 10 steps
failed after 9 steps


  result = entry_point.load(False)


failed after 69 steps
failed after 78 steps
failed after 63 steps
failed after 77 steps
failed after 68 steps
failed after 66 steps
failed after 92 steps
num_consecutive_successes = 1
failed after 169 steps
failed after 92 steps
failed after 45 steps
num_consecutive_successes = 1
failed after 17 steps
failed after 58 steps
failed after 59 steps
failed after 58 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
failed after 104 steps
failed after 157 steps
failed after 66 steps
failed after 154 steps
num_consecutive_successes = 1
failed after 180 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
num_consecutive_successes = 4
num_consecutive_successes = 5
num_consecutive_successes = 6
num_consecutive_successes = 7
num_consecutive_successes = 8
failed after 131 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
num_consecutive_successes = 4
num_consecutiv

# N-Step SARSA

In [2]:
import gym
import numpy as np
from skgym.value_functions import GenericQ
from skgym.policies import ValuePolicy
from skgym.algorithms import NStepSarsa
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer


# Train a Q-function on CartPole with n-step SARSA, then run one final greedy
# episode that is rendered instead of being used for updates.


# the Gym environment
env = gym.make('CartPole-v0')


# define sklearn model for approximating Q-function
regressor = SGDRegressor(eta0=0.5, learning_rate='constant')
# feature map: the raw input stacked with its element-wise square; this needs
# numpy, which is why it is imported explicitly at the top of this cell
transformer = FunctionTransformer(
    lambda x: np.hstack((x, x ** 2)), validate=False)


# define Q, its induced policy and update algorithm
Q = GenericQ(env, regressor, transformer)
policy = ValuePolicy(Q)
algo = NStepSarsa(Q, n=20, gamma=0.8)


# number of iterations
num_episodes = 200
# NOTE(review): this reads a private attribute of the env object -- confirm it
# is still available when upgrading gym
max_episode_steps = env._max_episode_steps


# used for early stopping: once this many episodes in a row have succeeded,
# the next episode becomes the final (rendered, greedy) one
success_streak_target = 9
num_consecutive_successes = 0


try:
    for episode in range(1, num_episodes + 1):
        # the final episode is either the last scheduled one or the one
        # that starts right after the success streak has been reached
        last_episode = (
            episode == num_episodes
            or num_consecutive_successes == success_streak_target)

        # init
        s = env.reset()

        # amount of random exploration
        if last_episode:
            epsilon = 0
            env.render()
        elif episode < 10:
            epsilon = 0.5
        else:
            epsilon = 0.01

        # the first action is drawn from the same epsilon-greedy policy as
        # all later ones, so the final episode (epsilon = 0) is greedy from
        # its very first step instead of starting with a random action
        a = policy.epsilon_greedy(s, epsilon)

        for t in range(1, max_episode_steps + 1):
            s_next, r, done, info = env.step(a)
            a_next = policy.epsilon_greedy(s_next, epsilon)

            # update or render: the final episode only shows the policy,
            # it does not feed the learning algorithm
            if not last_episode:
                algo.update(s, a, r, s_next, a_next, done)
            else:
                env.render()

            # keep track of consecutive successes: an episode counts as a
            # success only when it is done at the very last allowed step
            if done:
                if t == max_episode_steps:
                    num_consecutive_successes += 1
                    print(f"num_consecutive_successes = {num_consecutive_successes}")
                else:
                    num_consecutive_successes = 0
                    print(f"failed after {t} steps")
                break

            # prepare for next step
            s, a = s_next, a_next

        if last_episode:
            break
finally:
    # always close the environment, also when the loop raises or is
    # interrupted from the notebook
    env.close()

  result = entry_point.load(False)


failed after 15 steps
failed after 45 steps
failed after 32 steps
failed after 11 steps
failed after 11 steps
failed after 25 steps
failed after 19 steps
failed after 15 steps
failed after 22 steps
failed after 9 steps
failed after 31 steps
failed after 9 steps
failed after 88 steps
failed after 128 steps
failed after 130 steps
failed after 106 steps
failed after 12 steps
failed after 12 steps
failed after 153 steps
failed after 112 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
num_consecutive_successes = 4
num_consecutive_successes = 5
num_consecutive_successes = 6
failed after 145 steps
failed after 92 steps
failed after 11 steps
failed after 9 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
failed after 188 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
failed after 13 steps
failed after 9 steps
failed after 35 steps
failed after 10 steps

# N-Step Expected-SARSA

In [3]:
import gym
import numpy as np
from skgym.value_functions import GenericQ
from skgym.policies import ValuePolicy
from skgym.algorithms import NStepExpectedSarsa
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer


# Train a Q-function on CartPole with n-step Expected-SARSA, then run one
# final greedy episode that is rendered instead of being used for updates.


# the Gym environment
env = gym.make('CartPole-v0')


# define sklearn model for approximating Q-function
regressor = SGDRegressor(eta0=0.5, learning_rate='constant')
# feature map: the raw input stacked with its element-wise square; this needs
# numpy, which is why it is imported explicitly at the top of this cell
transformer = FunctionTransformer(
    lambda x: np.hstack((x, x ** 2)), validate=False)


# define Q, its induced policy and update algorithm
# (unlike the other update rules in this notebook, this one is also handed
# the policy object)
Q = GenericQ(env, regressor, transformer)
policy = ValuePolicy(Q)
algo = NStepExpectedSarsa(Q, policy, n=20, gamma=0.8)


# number of iterations
num_episodes = 200
# NOTE(review): this reads a private attribute of the env object -- confirm it
# is still available when upgrading gym
max_episode_steps = env._max_episode_steps


# used for early stopping: once this many episodes in a row have succeeded,
# the next episode becomes the final (rendered, greedy) one
success_streak_target = 9
num_consecutive_successes = 0


try:
    for episode in range(1, num_episodes + 1):
        # the final episode is either the last scheduled one or the one
        # that starts right after the success streak has been reached
        last_episode = (
            episode == num_episodes
            or num_consecutive_successes == success_streak_target)

        # init
        s = env.reset()

        # amount of random exploration
        if last_episode:
            epsilon = 0
            env.render()
        elif episode < 10:
            epsilon = 0.5
        else:
            epsilon = 0.01

        for t in range(1, max_episode_steps + 1):
            a = policy.epsilon_greedy(s, epsilon)
            s_next, r, done, info = env.step(a)

            # update or render: the final episode only shows the policy,
            # it does not feed the learning algorithm
            if not last_episode:
                algo.update(s, a, r, s_next, done)
            else:
                env.render()

            # keep track of consecutive successes: an episode counts as a
            # success only when it is done at the very last allowed step
            if done:
                if t == max_episode_steps:
                    num_consecutive_successes += 1
                    print(f"num_consecutive_successes = {num_consecutive_successes}")
                else:
                    num_consecutive_successes = 0
                    print(f"failed after {t} steps")
                break

            # prepare for next step
            s = s_next

        if last_episode:
            break
finally:
    # always close the environment, also when the loop raises or is
    # interrupted from the notebook
    env.close()

failed after 12 steps
failed after 9 steps
failed after 22 steps
failed after 10 steps
failed after 37 steps
failed after 16 steps
failed after 22 steps
failed after 15 steps
failed after 22 steps


  result = entry_point.load(False)


failed after 76 steps
failed after 10 steps
failed after 10 steps
failed after 9 steps
failed after 37 steps
failed after 114 steps
failed after 51 steps
failed after 38 steps
failed after 10 steps
failed after 32 steps
failed after 10 steps
failed after 46 steps
failed after 38 steps
failed after 19 steps
failed after 21 steps
failed after 8 steps
failed after 18 steps
failed after 10 steps
failed after 23 steps
failed after 27 steps
failed after 8 steps
failed after 38 steps
failed after 35 steps
failed after 31 steps
failed after 10 steps
failed after 130 steps
failed after 71 steps
failed after 82 steps
failed after 79 steps
failed after 69 steps
failed after 66 steps
failed after 127 steps
failed after 48 steps
failed after 94 steps
num_consecutive_successes = 1
failed after 88 steps
failed after 11 steps
failed after 133 steps
failed after 10 steps
failed after 90 steps
failed after 67 steps
failed after 40 steps
failed after 32 steps
failed after 174 steps
failed after 192 steps