# Reinforcement Learning Control

In [79]:
import numpy as np
import gym
from time import sleep
from IPython.display import clear_output

# helper functions from a local file
from utils.helper import epsilon_greedy
from utils.helper import generate_episode
from utils.helper import linear_decay
from utils.helper import print_policy, print_state_value_func

## Monte Carlo First Visit Control

In [18]:
def mc_fv_control(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=10000):
    '''
    Monte Carlo First Visit control function.
    The function calculates an estimation of the optimal Q-function, V-function and policy

    Each episode is sampled with an epsilon greedy policy over the current Q.
    Only the first occurrence of a (state, action) pair inside an episode
    moves Q towards the discounted return observed from that time step on.

    Args:
        env:            OpenAI gym environment to interact with
                        (observation and action spaces must expose `.n`)
        eps_start:      The starting value for epsilon using epsilon greedy
        eps_end:        The final value for epsilon using epsilon greedy
        alpha_start:    The starting value for the learning rate
        alpha_end:      The final value for the learning rate
        decay_duration: The number of episodes epsilon and alpha are decayed
        gamma:          Discounting rate for the bellman equation
        num_episodes:   The number of episodes to interact with the environment

    Returns:
        Q:              The estimate of the optimal action value function, shape (nS, nA)
        V:              The estimate of the optimal state value function (row-wise max of Q)
        pi:             The estimate of an optimal strategy (function: state -> greedy action)
    '''

    nS = env.observation_space.n
    nA = env.action_space.n
    # per-episode schedules, indexed by the episode number
    epsilons = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    alphas = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    Q = np.zeros((nS, nA), dtype=np.float32)

    # generates a policy based on the current Q and epsilon
    def create_pi(Q, epsilon):
        def pi(state):
            return epsilon_greedy(Q, state, epsilon)
        return pi

    for episode in range(num_episodes):
        pi = create_pi(Q, epsilons[episode])
        generated_episode = generate_episode(env, pi)

        # every step is a 5-tuple whose third entry is the reward
        rewards = [step[2] for step in generated_episode]

        # discounted return of every time step in a single backward pass:
        # G_t = r_t + gamma * G_{t+1}
        returns = np.zeros(len(rewards), dtype=np.float64)
        running_return = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running_return = rewards[t] + gamma * running_return
            returns[t] = running_return

        # plain `bool`: the `np.bool` alias no longer exists in current NumPy
        visited = np.zeros_like(Q, dtype=bool)

        for t, (obs, action, _, _, _) in enumerate(generated_episode):
            # first visit: later occurrences of the same pair are ignored
            if visited[obs][action]:
                continue
            visited[obs][action] = True

            mc_target = returns[t]
            Q[obs][action] = Q[obs][action] + alphas[episode] * (mc_target - Q[obs][action])

    V = np.max(Q, axis=1)

    # build the greedy lookup table once instead of on every policy call
    strategy = {state: action for state, action in enumerate(np.argmax(Q, axis=1))}

    def pi(s):
        return strategy[s]

    return Q, V, pi

## SARSA (TD On Policy learning)

In [23]:
def sarsa(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=10000):
    '''
    SARSA control function (On-Policy TD).
    The function calculates an estimation of the optimal Q-function, V-function and policy

    The bootstrap term uses the action that the epsilon greedy behaviour
    policy actually selects in the next state.

    Args:
        env:            OpenAI gym environment to interact with; `reset()` must return
                        the observation and `step()` a 4-tuple (obs, reward, done, info)
        eps_start:      The starting value for epsilon using epsilon greedy
        eps_end:        The final value for epsilon using epsilon greedy
        alpha_start:    The starting value for the learning rate
        alpha_end:      The final value for the learning rate
        decay_duration: The number of episodes epsilon and alpha are decayed
        gamma:          Discounting rate for the bellman equation
        num_episodes:   The number of episodes to interact with the environment

    Returns:
        Q:              The estimate of the optimal action value function, shape (nS, nA)
        V:              The estimate of the optimal state value function (row-wise max of Q)
        pi:             The estimate of an optimal strategy (function: state -> greedy action)
    '''
    nS = env.observation_space.n
    nA = env.action_space.n
    # per-episode schedules, indexed by the episode number
    epsilons = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    alphas = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    Q = np.zeros((nS, nA), dtype=np.float32)

    for episode in range(num_episodes):
        epsilon, alpha = epsilons[episode], alphas[episode]
        obs, done = env.reset(), False
        action = epsilon_greedy(Q, obs, epsilon)
        while not done:
            next_obs, reward, done, _ = env.step(action)
            next_action = epsilon_greedy(Q, next_obs, epsilon)
            # no bootstrapping once the episode has ended
            bootstrap = 0.0 if done else gamma * Q[next_obs][next_action]
            sarsa_target = reward + bootstrap
            Q[obs][action] = Q[obs][action] + alpha * (sarsa_target - Q[obs][action])
            obs, action = next_obs, next_action

    V = np.max(Q, axis=1)

    # build the greedy lookup table once instead of on every policy call
    strategy = {state: action for state, action in enumerate(np.argmax(Q, axis=1))}

    def pi(s):
        return strategy[s]

    return Q, V, pi

## Q-Learning (TD Off Policy learning)

In [29]:
def q_learning(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=10000):
    '''
    Q-Learning control function (Off-Policy TD).
    The function calculates an estimation of the optimal Q-function, V-function and policy

    Actions are selected epsilon greedily, while the bootstrap term always
    uses the largest action value of the next state.

    Args:
        env:            OpenAI gym environment to interact with; `reset()` must return
                        the observation and `step()` a 4-tuple (obs, reward, done, info)
        eps_start:      The starting value for epsilon using epsilon greedy
        eps_end:        The final value for epsilon using epsilon greedy
        alpha_start:    The starting value for the learning rate
        alpha_end:      The final value for the learning rate
        decay_duration: The number of episodes epsilon and alpha are decayed
        gamma:          Discounting rate for the bellman equation
        num_episodes:   The number of episodes to interact with the environment

    Returns:
        Q:              The estimate of the optimal action value function, shape (nS, nA)
        V:              The estimate of the optimal state value function (row-wise max of Q)
        pi:             The estimate of an optimal strategy (function: state -> greedy action)
    '''
    nS = env.observation_space.n
    nA = env.action_space.n
    # per-episode schedules, indexed by the episode number
    epsilons = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    alphas = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    Q = np.zeros((nS, nA), dtype=np.float32)

    for episode in range(num_episodes):
        epsilon, alpha = epsilons[episode], alphas[episode]
        obs, done = env.reset(), False
        while not done:
            action = epsilon_greedy(Q, obs, epsilon)
            next_obs, reward, done, _ = env.step(action)
            # greedy bootstrap; dropped once the episode has ended
            bootstrap = 0.0 if done else gamma * np.max(Q[next_obs])
            q_target = reward + bootstrap
            Q[obs][action] = Q[obs][action] + alpha * (q_target - Q[obs][action])
            obs = next_obs

    V = np.max(Q, axis=1)

    # build the greedy lookup table once instead of on every policy call
    strategy = {state: action for state, action in enumerate(np.argmax(Q, axis=1))}

    def pi(s):
        return strategy[s]

    return Q, V, pi

## Double Q-Learning (reducing maximization bias)

In [35]:
def double_q_learning(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=10000):
    '''
    Double Q-Learning control function (Off-Policy TD).
    The function calculates an estimation of the optimal Q-function, V-function and policy

    Two tables are maintained. On every step one of them is picked at random
    to be updated: it chooses the greedy next action, while the other table
    supplies the value of that action. Behaviour and the returned estimates
    are based on the average of both tables.

    Args:
        env:            OpenAI gym environment to interact with; `reset()` must return
                        the observation and `step()` a 4-tuple (obs, reward, done, info)
        eps_start:      The starting value for epsilon using epsilon greedy
        eps_end:        The final value for epsilon using epsilon greedy
        alpha_start:    The starting value for the learning rate
        alpha_end:      The final value for the learning rate
        decay_duration: The number of episodes epsilon and alpha are decayed
        gamma:          Discounting rate for the bellman equation
        num_episodes:   The number of episodes to interact with the environment

    Returns:
        Q:              The estimate of the optimal action value function (mean of both tables)
        V:              The estimate of the optimal state value function (row-wise max of Q)
        pi:             The estimate of an optimal strategy (function: state -> greedy action)
    '''
    nS = env.observation_space.n
    nA = env.action_space.n
    # per-episode schedules, indexed by the episode number
    epsilons = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    alphas = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    Q_1 = np.zeros((nS, nA), dtype=np.float32)
    Q_2 = np.zeros((nS, nA), dtype=np.float32)

    for episode in range(num_episodes):
        epsilon, alpha = epsilons[episode], alphas[episode]
        obs, done = env.reset(), False
        while not done:
            action = epsilon_greedy((Q_1 + Q_2) / 2, obs, epsilon)
            next_obs, reward, done, _ = env.step(action)

            # coin flip: which table learns and which one evaluates
            if np.random.random() < 0.5:
                learner, evaluator = Q_1, Q_2
            else:
                learner, evaluator = Q_2, Q_1

            # the learner selects the action, the evaluator rates it
            next_action = np.argmax(learner[next_obs])
            bootstrap = 0.0 if done else gamma * evaluator[next_obs][next_action]
            td_target = reward + bootstrap
            learner[obs][action] = learner[obs][action] + alpha * (td_target - learner[obs][action])
            obs = next_obs

    Q = (Q_1 + Q_2) / 2
    V = np.max(Q, axis=1)

    # build the greedy lookup table once instead of on every policy call
    strategy = {state: action for state, action in enumerate(np.argmax(Q, axis=1))}

    def pi(s):
        return strategy[s]

    return Q, V, pi

## Frozen Lake

In [6]:
# Create the FrozenLake environment used by the experiments of this section.
env = gym.make('FrozenLake-v0')

In [7]:
def value_iteration(env, gamma=0.99, delta=1e-10):
    '''
    Finds an optimal policy and its state value function with value iteration

    Args:
        env:     openai gym environment with discrete spaces; the underlying
                 (unwrapped) environment must expose the transition table P, where
                 P[state][action] is a list of (probability, next_state, reward, done)
        gamma:   discount factor
        delta:   threshold value; the sweeps stop as soon as the largest
                 change of a state value is below it

    Returns:
        pi:      function mapping a state to its greedy action
        V:       array with the value of every state
    '''
    nS = env.observation_space.n
    nA = env.action_space.n
    # `unwrapped` reaches the base environment no matter how many wrappers surround it
    P = env.unwrapped.P
    V = np.zeros(nS, dtype=np.float64)

    while True:
        # one full sweep: expected one-step lookahead for every (state, action)
        Q = np.zeros(shape=(nS, nA), dtype=np.float64)
        for state in range(nS):
            for action in range(nA):
                Q[state][action] = sum(
                    pr * (reward + gamma * V[next_state] * (not done))
                    for pr, next_state, reward, done in P[state][action]
                )
        V_new = np.max(Q, axis=1)
        max_diff = np.max(np.abs(V_new - V))
        V = V_new
        if max_diff < delta:
            break

    # greedy policy with respect to the Q of the final sweep
    strategy = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}

    def pi(s):
        return strategy[s]

    return pi, V

In [46]:
# Reference solution computed from the environment's transition table;
# the sampled estimates further down can be compared against it.
pi, V = value_iteration(env, gamma=0.99)

In [47]:
# Show the value-iteration policy (the output below is a 4x4 grid of arrows).
print_policy(pi, 16, 4, name='FrozenLake optimal policy')



[1mFrozenLake optimal policy[0m


         ←          ↑          ↑          ↑
         ←          ■          ←          ■
         ↑          ↓          ←          ■
         ■          →          ↓          ■


In [48]:
# Show the value-iteration state values, four per row as in the output below.
print_state_value_func(V, 4, name='FrozenLake optimal value function')



[1mFrozenLake optimal value function[0m


0.54203 0.49880 0.47070 0.45685
0.55845 0.00000 0.35835 0.00000
0.59180 0.64308 0.61521 0.00000
0.00000 0.74172 0.86284 0.00000


### Monte Carlo Frozen Lake control

In [10]:
#parameters
# passed to every learner call in this notebook, including the Taxi run
num_episodes = 10000    # episodes of interaction per training run
decay_duration = 5000   # episodes over which epsilon and alpha are decayed

In [53]:
# Train with first-visit Monte Carlo control.
# Rebinds the notebook-level V and pi, replacing the value-iteration results.
Q, V, pi = mc_fv_control(env, eps_start=1.0, eps_end=0.01, alpha_start=0.05, alpha_end=0.01, \
                     decay_duration=decay_duration, num_episodes=num_episodes)

In [56]:
# Show the greedy policy learned by Monte Carlo control.
print_policy(pi, 16, 4, name='FrozenLake mc optimal policy estimation')



[1mFrozenLake mc optimal policy estimation[0m


         ↓          ↑          ↓          ↑
         ←          ■          ←          ■
         ↑          ↓          ←          ■
         ■          →          ↓          ■


In [57]:
# Show the state values estimated by Monte Carlo control.
print_state_value_func(V, 4, name='FrozenLake mc optimal value function estimation')



[1mFrozenLake mc optimal value function estimation[0m


0.37898 0.32744 0.28837 0.26887
0.41985 0.00000 0.27365 0.00000
0.49916 0.56090 0.55292 0.00000
0.00000 0.67714 0.82830 0.00000


### SARSA Frozen Lake control

In [60]:
# Train with SARSA using the same epsilon/alpha settings as the Monte Carlo run.
# Rebinds the notebook-level Q, V and pi.
Q, V, pi = sarsa(env, eps_start=1.0, eps_end=0.01, alpha_start=0.05, alpha_end=0.01, \
             decay_duration=decay_duration, num_episodes=num_episodes)

In [61]:
# Show the greedy policy learned by SARSA.
print_policy(pi, 16, 4, name='FrozenLake SARSA optimal policy estimation')



[1mFrozenLake SARSA optimal policy estimation[0m


         ←          ↑          ←          ↑
         ←          ■          ←          ■
         ↑          ↓          ↓          ■
         ■          →          ↓          ■


In [63]:
# Show the state values estimated by SARSA.
print_state_value_func(V, 4, name='FrozenLake SARSA optimal value function estimation')



[1mFrozenLake SARSA optimal value function estimation[0m


0.39224 0.24000 0.13246 0.01958
0.40921 0.00000 0.13352 0.00000
0.44517 0.50703 0.43842 0.00000
0.00000 0.64681 0.81653 0.00000


### Q-Learning Frozen Lake control

In [66]:
# Train with Q-Learning; note the larger initial learning rate (0.5 instead of 0.05).
# Rebinds the notebook-level Q, V and pi.
Q, V, pi = q_learning(env, eps_start=1.0, eps_end=0.01, alpha_start=0.5, alpha_end=0.01, \
                  decay_duration=decay_duration, num_episodes=num_episodes)

In [67]:
# Show the greedy policy learned by Q-Learning.
print_policy(pi, 16, 4, name='FrozenLake Q-Learning optimal policy estimation')



[1mFrozenLake Q-Learning optimal policy estimation[0m


         ←          ↑          ↑          ↑
         ←          ■          ←          ■
         ↑          ↓          ←          ■
         ■          →          ↓          ■


In [68]:
# Show the state values estimated by Q-Learning.
print_state_value_func(V, 4, name='FrozenLake Q-Learning optimal value function estimation')



[1mFrozenLake Q-Learning optimal value function estimation[0m


0.49982 0.44215 0.41037 0.39833
0.51633 0.00000 0.31295 0.00000
0.54115 0.60409 0.60304 0.00000
0.00000 0.71231 0.84676 0.00000


### Double Q-Learning Frozen Lake control

In [69]:
# Train with Double Q-Learning using the same settings as the Q-Learning run.
# Rebinds the notebook-level Q, V and pi.
Q, V, pi = double_q_learning(env, eps_start=1.0, eps_end=0.01, alpha_start=0.5, alpha_end=0.01, \
                         decay_duration=decay_duration, num_episodes=num_episodes)

In [70]:
# Show the greedy policy learned by Double Q-Learning.
print_policy(pi, 16, 4, name='FrozenLake Double Q-Learning optimal policy estimation')



[1mFrozenLake Double Q-Learning optimal policy estimation[0m


         ←          ↑          ←          ↑
         ←          ■          ←          ■
         ↑          ↓          ←          ■
         ■          →          ↓          ■


In [71]:
# Show the state values estimated by Double Q-Learning.
print_state_value_func(V, 4, name='FrozenLake Double Q-Learning optimal value function estimation')



[1mFrozenLake Double Q-Learning optimal value function estimation[0m


0.50423 0.43319 0.36527 0.23953
0.52129 0.00000 0.30889 0.00000
0.55442 0.60340 0.57968 0.00000
0.00000 0.72071 0.83441 0.00000


## Taxi environment with Q Learning

In [73]:
# Replace the FrozenLake environment with Taxi; `env` now refers to the new environment.
env = gym.make('Taxi-v3')

In [74]:
# Train Q-Learning on Taxi, reusing num_episodes and decay_duration from the FrozenLake runs.
# The resulting `pi` drives the playback cell below.
Q, V, pi = q_learning(env, eps_start=1.0, eps_end=0.01, alpha_start=0.5, alpha_end=0.01, \
                  decay_duration=decay_duration, num_episodes=num_episodes)

In [88]:
# Play an episode of taxi
# Follows the learned policy `pi` until the environment reports `done`,
# redrawing the board after every step.
obs, done = env.reset(), False
env.render()
while not done:
    # pause so that each frame stays visible for half a second
    sleep(0.5)
    # wait=True postpones clearing until the next frame is printed
    clear_output(wait=True)
    action = pi(obs)
    next_obs, reward, done, _ = env.step(action)
    obs = next_obs
    env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
