# Table Of Contents

- [Q-learning](#Q-learning)
- [SARSA](#SARSA)
- [SARSA with type-II model](#SARSA-with-type-II-model)
- [Expected-SARSA](#Expected-SARSA)


# Q-learning

In [1]:
import gym
from skgym.value_functions import GenericQ
from skgym.policies import ValuePolicy
from skgym.algorithms import QLearning
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer


# the Gym environment
env = gym.make('CartPole-v0')


# define sklearn model for approximating Q-function
regressor = SGDRegressor(eta0=0.1, learning_rate='constant')
transformer = FunctionTransformer(
    lambda x: np.hstack((x, x ** 2)), validate=False)


# define Q, its induced policy and update algorithm
Q = GenericQ(env, regressor, transformer)
policy = ValuePolicy(Q)
algo = QLearning(Q, gamma=0.8)


# number of iterations
num_episodes = 200
max_episode_steps = env._max_episode_steps


# used for early stopping
num_consecutive_successes = 0


for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9
    
    # init
    s = env.reset()
    
    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01
    
    for t in range(1, max_episode_steps + 1):
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)
        
        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next)            
        else:
            env.render()
        
        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break
    
        # prepare for next step
        s = s_next

        
    if last_episode:
        break


env.close()

  result = entry_point.load(False)


failed after 22 steps
failed after 13 steps
failed after 10 steps
failed after 59 steps
failed after 41 steps
failed after 10 steps
failed after 19 steps
failed after 19 steps
failed after 10 steps
failed after 8 steps
failed after 9 steps
failed after 9 steps
failed after 10 steps
failed after 10 steps
failed after 9 steps
failed after 9 steps
failed after 10 steps
failed after 9 steps
failed after 9 steps
failed after 34 steps
failed after 11 steps
failed after 40 steps
failed after 44 steps
failed after 161 steps
failed after 155 steps
failed after 89 steps
failed after 171 steps
failed after 137 steps
failed after 109 steps
failed after 164 steps
failed after 163 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
failed after 82 steps
failed after 117 steps
failed after 195 steps
failed after 154 steps
failed after 178 steps
failed after 178 steps
failed after 143 steps
failed after 195 steps
failed after 91 steps
failed after 174 steps
failed after 166 steps
failed 

# SARSA

In [2]:
import gym
from skgym.value_functions import GenericQ
from skgym.policies import ValuePolicy
from skgym.algorithms import Sarsa
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer


# the Gym environment
env = gym.make('CartPole-v0')


# define sklearn model for approximating Q-function
regressor = SGDRegressor(eta0=0.05, learning_rate='constant')
transformer = FunctionTransformer(
    lambda x: np.hstack((x, x ** 2)), validate=False)


# define Q, its induced policy and update algorithm
Q = GenericQ(env, regressor, transformer)
policy = ValuePolicy(Q)
algo = Sarsa(Q, gamma=0.8)


# number of iterations
num_episodes = 200
max_episode_steps = env._max_episode_steps


# used for early stopping
num_consecutive_successes = 0


for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9
    
    # init
    s = env.reset()
    a = env.action_space.sample()
    
    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01
    
    for t in range(1, max_episode_steps + 1):
        s_next, r, done, info = env.step(a)
        a_next = policy.epsilon_greedy(s, epsilon)
        
        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next, a_next)         
        else:
            env.render()
        
        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break
    
        # prepare for next step
        s, a = s_next, a_next

        
    if last_episode:
        break


env.close()

  result = entry_point.load(False)


failed after 88 steps
failed after 10 steps
failed after 14 steps
failed after 16 steps
failed after 14 steps
failed after 14 steps
failed after 11 steps
failed after 50 steps
failed after 69 steps
failed after 88 steps
failed after 95 steps
failed after 46 steps
failed after 119 steps
failed after 100 steps
failed after 90 steps
num_consecutive_successes = 1
failed after 160 steps
failed after 186 steps
failed after 125 steps
failed after 129 steps
failed after 82 steps
failed after 119 steps
failed after 135 steps
failed after 67 steps
failed after 160 steps
failed after 147 steps
failed after 156 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
failed after 155 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
failed after 129 steps
num_consecutive_successes = 1
failed after 103 steps
num_consecutive_successes = 1
failed after 199 steps
num_consecutive_successes = 1
failed after 185 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
nu

In [3]:
# # record video
# env = gym.make('CartPole-v1')
# env = gym.wrappers.Monitor(env, os.path.join('data', 'video', 'cartpole-linear-model-sarsa'), force=True)
# s = env.reset()
# env.render()
# done = False

# while not done:
#     a = policy.greedy(s)
#     s, _, done, _ = env.step(a)
#     env.render()
    
# env.close()

# SARSA with type-II model

In [4]:
import gym
from skgym.value_functions import GenericQ
from skgym.policies import ValuePolicy
from skgym.algorithms import Sarsa
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer


# the Gym environment
env = gym.make('CartPole-v0')


# define sklearn model for approximating Q-function
regressor = SGDRegressor(eta0=0.5, learning_rate='constant')
transformer = FunctionTransformer(
    lambda x: np.hstack((x, x ** 2)), validate=False)
# transformer = None


# define Q, its induced policy and update algorithm
Q = GenericQ(env, regressor, transformer, model_type=2)
policy = ValuePolicy(Q)
algo = Sarsa(Q, gamma=0.8)


# number of iterations
num_episodes = 1000
max_episode_steps = env._max_episode_steps


# used for early stopping
num_consecutive_successes = 0


for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9
    
    # init
    s = env.reset()
    a = env.action_space.sample()
    
    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01
    
    for t in range(1, max_episode_steps + 1):
        s_next, r, done, info = env.step(a)
        a_next = policy.epsilon_greedy(s, epsilon)
        
        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next, a_next)         
        else:
            env.render()
        
        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break
    
        # prepare for next step
        s, a = s_next, a_next

        
    if last_episode:
        break


env.close()

failed after 19 steps
failed after 14 steps
failed after 15 steps


  result = entry_point.load(False)


failed after 10 steps
failed after 21 steps
failed after 9 steps
failed after 76 steps
failed after 12 steps
failed after 15 steps
failed after 12 steps
failed after 19 steps
failed after 12 steps
failed after 9 steps
failed after 10 steps
failed after 17 steps
failed after 14 steps
failed after 8 steps
failed after 11 steps
failed after 12 steps
failed after 10 steps
failed after 10 steps
failed after 17 steps
failed after 13 steps
failed after 10 steps
failed after 12 steps
failed after 12 steps
failed after 10 steps
failed after 10 steps
failed after 9 steps
failed after 10 steps
failed after 13 steps
failed after 11 steps
failed after 9 steps
failed after 11 steps
failed after 13 steps
failed after 8 steps
failed after 14 steps
failed after 10 steps
failed after 38 steps
failed after 30 steps
failed after 22 steps
failed after 11 steps
failed after 10 steps
failed after 15 steps
failed after 27 steps
failed after 12 steps
failed after 9 steps
failed after 12 steps
failed after 14 s

failed after 9 steps
failed after 8 steps
failed after 10 steps
failed after 11 steps
failed after 9 steps
failed after 11 steps
failed after 12 steps
failed after 9 steps
failed after 11 steps
failed after 11 steps
failed after 12 steps
failed after 41 steps
failed after 29 steps
failed after 14 steps
failed after 10 steps
failed after 11 steps
failed after 11 steps
failed after 9 steps
failed after 9 steps
failed after 9 steps
failed after 10 steps
failed after 15 steps
failed after 10 steps
failed after 9 steps
failed after 14 steps
failed after 13 steps
failed after 10 steps
failed after 9 steps
failed after 15 steps
failed after 61 steps
failed after 9 steps
failed after 13 steps
failed after 8 steps
failed after 10 steps
failed after 12 steps
failed after 12 steps
failed after 10 steps
failed after 12 steps
failed after 12 steps
failed after 9 steps
failed after 10 steps
failed after 9 steps
failed after 13 steps
failed after 32 steps
failed after 29 steps
failed after 11 steps
f

failed after 11 steps
failed after 32 steps
failed after 12 steps
failed after 12 steps
failed after 27 steps
failed after 32 steps
num_consecutive_successes = 1
failed after 13 steps
failed after 11 steps
failed after 11 steps
failed after 14 steps
failed after 11 steps
failed after 10 steps
failed after 11 steps
failed after 11 steps
failed after 12 steps
failed after 12 steps
failed after 8 steps
failed after 11 steps
failed after 11 steps
failed after 10 steps
failed after 11 steps
failed after 11 steps
failed after 9 steps
failed after 13 steps
failed after 12 steps
failed after 9 steps
failed after 10 steps
failed after 12 steps
failed after 13 steps
failed after 10 steps
failed after 10 steps
failed after 14 steps
failed after 11 steps
failed after 13 steps
failed after 11 steps
failed after 11 steps
failed after 10 steps
failed after 12 steps
failed after 10 steps
failed after 9 steps
failed after 13 steps
failed after 14 steps
failed after 11 steps
failed after 36 steps
failed

# Expected SARSA

In [5]:
import gym
from skgym.value_functions import GenericQ
from skgym.policies import ValuePolicy
from skgym.algorithms import ExpectedSarsa
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer


# the Gym environment
env = gym.make('CartPole-v0')


# define sklearn model for approximating Q-function
regressor = SGDRegressor(eta0=0.1, learning_rate='constant')
transformer = FunctionTransformer(
    lambda x: np.hstack((x, x ** 2)), validate=False)


# define Q, its induced policy and update algorithm
Q = GenericQ(env, regressor, transformer)
policy = ValuePolicy(Q)
algo = ExpectedSarsa(Q, policy, gamma=0.8)


# number of iterations
num_episodes = 200
max_episode_steps = env._max_episode_steps


# used for early stopping
num_consecutive_successes = 0


for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9
    
    # init
    s = env.reset()
    
    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01
    
    for t in range(1, max_episode_steps + 1):
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)
        
        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next)            
        else:
            env.render()
        
        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break
    
        # prepare for next step
        s = s_next

        
    if last_episode:
        break


env.close()

  result = entry_point.load(False)


failed after 9 steps
failed after 11 steps
failed after 8 steps
failed after 20 steps
failed after 10 steps
failed after 12 steps
failed after 14 steps
failed after 11 steps
failed after 9 steps
failed after 9 steps
failed after 10 steps
failed after 9 steps
failed after 9 steps
failed after 8 steps
failed after 9 steps
failed after 10 steps
failed after 9 steps
failed after 10 steps
failed after 10 steps
failed after 10 steps
failed after 10 steps
failed after 16 steps
failed after 19 steps
failed after 8 steps
failed after 22 steps
failed after 9 steps
failed after 8 steps
failed after 8 steps
failed after 45 steps
failed after 9 steps
failed after 9 steps
failed after 9 steps
failed after 9 steps
failed after 10 steps
failed after 13 steps
failed after 8 steps
failed after 8 steps
failed after 9 steps
failed after 9 steps
failed after 10 steps
failed after 9 steps
failed after 9 steps
failed after 9 steps
failed after 10 steps
failed after 8 steps
failed after 21 steps
failed after 